diff --git a/lib/kokkos/Copyright.txt b/lib/kokkos/Copyright.txt
new file mode 100755
index 0000000000000000000000000000000000000000..05980758fa8fe6317bb08fcc6eb70668b5fd1580
--- /dev/null
+++ b/lib/kokkos/Copyright.txt
@@ -0,0 +1,40 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
diff --git a/lib/kokkos/LICENSE b/lib/kokkos/LICENSE
new file mode 100755
index 0000000000000000000000000000000000000000..05980758fa8fe6317bb08fcc6eb70668b5fd1580
--- /dev/null
+++ b/lib/kokkos/LICENSE
@@ -0,0 +1,40 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos
new file mode 100755
index 0000000000000000000000000000000000000000..473039af52bb23ebcb68b5b7494a0c3625b92154
--- /dev/null
+++ b/lib/kokkos/Makefile.kokkos
@@ -0,0 +1,318 @@
+# Default settings common options
+
+KOKKOS_PATH=../../lib/kokkos
+
+#Options: OpenMP,Serial,Pthreads,Cuda
+KOKKOS_DEVICES ?= "OpenMP"
+#KOKKOS_DEVICES ?= "Pthreads"
+#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,ARMv8,BGQ,Power7,Power8
+KOKKOS_ARCH ?= ""
+#Options: yes,no
+KOKKOS_DEBUG ?= "no"
+#Options: hwloc,librt
+KOKKOS_USE_TPLS ?= ""
+
+#Default settings specific options
+#Options: force_uvm,use_ldg,rdc
+KOKKOS_CUDA_OPTIONS ?= ""
+
+# Check for general settings
+
+KOKKOS_CXX_STANDARD ?= "c++11"
+
+KOKKOS_INTERNAL_ENABLE_DEBUG := $(strip $(shell echo $(KOKKOS_DEBUG) | grep "yes" | wc -l))
+KOKKOS_INTERNAL_ENABLE_PROFILING_COLLECT_KERNEL_DATA := $(strip $(shell echo $(KOKKOS_PROFILING) | grep "kernel_times" | wc -l))
+KOKKOS_INTERNAL_ENABLE_PROFILING_AGGREGATE_MPI := $(strip $(shell echo $(KOKKOS_PROFILING) | grep "aggregate_mpi" | wc -l))
+KOKKOS_INTERNAL_ENABLE_CXX11 := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++11" | wc -l))
+
+# Check for external libraries
+KOKKOS_INTERNAL_USE_HWLOC := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "hwloc" | wc -l))
+KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "librt" | wc -l))
+
+# Check for advanced settings
+KOKKOS_INTERNAL_CUDA_USE_LDG := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "use_ldg" | wc -l))
+KOKKOS_INTERNAL_CUDA_USE_UVM := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "force_uvm" | wc -l))
+KOKKOS_INTERNAL_CUDA_USE_RELOC := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "rdc" | wc -l))
+
+# Check for Kokkos Host Execution Spaces one of which must be on
+
+KOKKOS_INTERNAL_USE_OPENMP := $(strip $(shell echo $(KOKKOS_DEVICES) | grep OpenMP | wc -l))
+KOKKOS_INTERNAL_USE_PTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Pthread | wc -l))
+KOKKOS_INTERNAL_USE_SERIAL := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Serial | wc -l))
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0)
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0)
+	KOKKOS_INTERNAL_USE_SERIAL := 1
+endif
+endif
+
+KOKKOS_INTERNAL_COMPILER_PGI := $(shell $(CXX) --version | grep PGI | wc -l)
+
+ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+  KOKKOS_INTERNAL_OPENMP_FLAG := -mp 
+else
+  KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
+  KOKKOS_INTERNAL_CXX11_FLAG := --c++11
+else
+  KOKKOS_INTERNAL_CXX11_FLAG := --std=c++11
+endif
+# Check for other Execution Spaces
+
+KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | wc -l))
+
+# Check for Kokkos Architecture settings
+
+#Intel based
+KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
+
+#NVIDIA based
+KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler30 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_KEPLER32 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler32 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler35 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_KEPLER37 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler37 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell50 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell52 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell53 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
+KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37)  \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
+                                                      + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
+endif
+
+#ARM based
+KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv8 | wc -l))
+
+#IBM based
+KOKKOS_INTERNAL_USE_ARCH_BGQ := $(strip $(shell echo $(KOKKOS_ARCH) | grep BGQ | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power7 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power8 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8) | bc))
+
+#AMD based
+KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
+
+#Any AVX?
+KOKKOS_INTERNAL_USE_ARCH_AVX  := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
+KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW) | bc ))
+
+#Incompatible flags?
+KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV80)>1" | bc ))
+KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIHOST), 1)
+  $(error Defined Multiple Host architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIGPU), 1)
+  $(error Defined Multiple GPU architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )
+endif
+
+#Generating the list of Flags
+
+KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
+# No warnings:
+KOKKOS_CXXFLAGS =
+# INTEL and CLANG warnings:
+#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized
+# GCC warnings:
+#KOKKOS_CXXFLAGS = -Wall -Wshadow -pedantic -Wsign-compare -Wtype-limits -Wuninitialized -Wignored-qualifiers -Wempty-body -Wclobbered
+
+KOKKOS_LIBS = -lkokkos 
+KOKKOS_LDFLAGS = -L$(shell pwd)
+KOKKOS_SRC = 
+KOKKOS_HEADERS =
+
+#Generating the KokkosCore_config.h file
+
+tmp := $(shell echo "/* ---------------------------------------------" > KokkosCore_config.tmp)
+tmp := $(shell echo "Makefile constructed configuration:" >> KokkosCore_config.tmp)
+tmp := $(shell date >> KokkosCore_config.tmp)
+tmp := $(shell echo "----------------------------------------------*/" >> KokkosCore_config.tmp)
+
+
+tmp := $(shell echo "/* Execution Spaces */" >> KokkosCore_config.tmp)
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	tmp := $(shell echo '\#define KOKKOS_HAVE_OPENMP 1' >> KokkosCore_config.tmp) 
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+	tmp := $(shell echo "\#define KOKKOS_HAVE_PTHREAD 1" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+	tmp := $(shell echo "\#define KOKKOS_HAVE_SERIAL 1" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	tmp := $(shell echo "\#define KOKKOS_HAVE_CUDA 1" >> KokkosCore_config.tmp )
+endif
+
+tmp := $(shell echo "/* General Settings */" >> KokkosCore_config.tmp)
+ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
+	KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
+	tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1)
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	KOKKOS_CXXFLAGS += -G
+endif
+	KOKKOS_CXXFLAGS += -g 
+	KOKKOS_LDFLAGS += -g -ldl
+	tmp := $(shell echo "\#define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK 1" >> KokkosCore_config.tmp )
+	tmp := $(shell echo "\#define KOKKOS_HAVE_DEBUG 1" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1)
+	KOKKOS_CPPFLAGS += -I$(HWLOC_PATH)/include
+	KOKKOS_LDFLAGS += -L$(HWLOC_PATH)/lib 
+        KOKKOS_LIBS += -lhwloc
+	tmp := $(shell echo "\#define KOKKOS_HAVE_HWLOC 1" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1)
+	tmp := $(shell echo "\#define KOKKOS_USE_LIBRT 1" >> KokkosCore_config.tmp )
+	tmp := $(shell echo "\#define PREC_TIMER 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOSP_ENABLE_RTLIB 1" >> KokkosCore_config.tmp )
+	KOKKOS_LIBS += -lrt
+endif
+
+tmp := $(shell echo "/* Cuda Settings */" >> KokkosCore_config.tmp)
+
+ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1)
+	tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LDG_INTRINSIC 1" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1)
+	tmp := $(shell echo "\#define KOKKOS_CUDA_USE_UVM 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_USE_CUDA_UVM 1" >> KokkosCore_config.tmp )
+endif
+
+ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1)
+	tmp := $(shell echo "\#define KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE 1" >> KokkosCore_config.tmp )
+	KOKKOS_CXXFLAGS += --relocatable-device-code=true
+	KOKKOS_LDFLAGS += --relocatable-device-code=true
+endif
+
+#Add Architecture flags
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
+	KOKKOS_CXXFLAGS += -mavx
+	KOKKOS_LDFLAGS += -mavx
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
+	KOKKOS_CXXFLAGS += -xcore-avx2
+	KOKKOS_LDFLAGS += -xcore-avx2
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)
+	KOKKOS_CXXFLAGS += -mmic
+	KOKKOS_LDFLAGS += -mmic
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
+	KOKKOS_CXXFLAGS += -arch=sm_30
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1)
+	KOKKOS_CXXFLAGS += -arch=sm_32
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1)
+	KOKKOS_CXXFLAGS += -arch=sm_35
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1)
+	KOKKOS_CXXFLAGS += -arch=sm_37
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1)
+	KOKKOS_CXXFLAGS += -arch=sm_50
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1)
+	KOKKOS_CXXFLAGS += -arch=sm_52
+endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
+	KOKKOS_CXXFLAGS += -arch=sm_53
+endif
+endif
+ 
+KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)
+ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
+KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
+else
+KOKKOS_INTERNAL_NEW_CONFIG := 1
+endif
+
+ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
+	tmp := $(shell cp KokkosCore_config.tmp KokkosCore_config.h)
+endif
+
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp)
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp)
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp)
+KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/algorithms/src/*.hpp)
+
+KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.cpp)
+KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.cpp)
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.cpp)
+	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
+	KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64 
+	KOKKOS_LIBS += -lcudart -lcuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+	KOKKOS_LIBS += -lpthread
+	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
+	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
+	KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
+	ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+		KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMP_FLAG)
+	else
+		KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
+	endif
+	KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
+endif
+
+
+# Setting up dependencies
+
+KokkosCore_config.h:
+
+KOKKOS_CPP_DEPENDS := KokkosCore_config.h $(KOKKOS_HEADERS)
+
+KOKKOS_OBJ = $(KOKKOS_SRC:.cpp=.o)
+KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ))
+
+include $(KOKKOS_PATH)/Makefile.targets
+
+kokkos-clean:
+	rm -f $(KOKKOS_OBJ_LINK) KokkosCore_config.h KokkosCore_config.tmp libkokkos.a
+
+libkokkos.a: $(KOKKOS_OBJ_LINK) $(KOKKOS_SRC) $(KOKKOS_HEADERS)
+	ar cr libkokkos.a $(KOKKOS_OBJ_LINK)
+
+KOKKOS_LINK_DEPENDS=libkokkos.a
diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets
new file mode 100755
index 0000000000000000000000000000000000000000..86708ac80176c18d6cd08547c2715a600edcc997
--- /dev/null
+++ b/lib/kokkos/Makefile.targets
@@ -0,0 +1,50 @@
+Kokkos_UnorderedMap_impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
+Kokkos_AllocationTracker.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp
+Kokkos_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp
+Kokkos_Core.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
+Kokkos_Error.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Error.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Error.cpp
+Kokkos_HostSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp
+Kokkos_hwloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp
+Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
+Kokkos_Serial_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
+Kokkos_Shape.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
+Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
+Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
+KokkosExp_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/KokkosExp_SharedAlloc.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/KokkosExp_SharedAlloc.cpp
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+Kokkos_Cuda_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
+Kokkos_Cuda_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
+Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+Kokkos_ThreadsExec_base.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec_base.cpp
+Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
+Kokkos_Threads_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+Kokkos_OpenMPexec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
+endif
+
diff --git a/lib/kokkos/README b/lib/kokkos/README
new file mode 100755
index 0000000000000000000000000000000000000000..f979495bfd64ddf3ed12f083e5625920bd372f9c
--- /dev/null
+++ b/lib/kokkos/README
@@ -0,0 +1,97 @@
+Kokkos implements a programming model in C++ for writing performance portable
+applications targeting all major HPC platforms. For that purpose it provides
+abstractions for both parallel execution of code and data management.
+Kokkos is designed to target complex node architectures with N-level memory
+hierarchies and multiple types of execution resources. It currently can use
+OpenMP, Pthreads and CUDA as backend programming models.
+
+The core developers of Kokkos are Carter Edwards and Christian Trott
+at the Computer Science Research Institute of the Sandia National
+Laboratories.
+
+The KokkosP interface and associated tools are developed by the Application
+Performance Team and Kokkos core developers at Sandia National Laboratories.
+
+To learn more about Kokkos consider watching one of our presentations:
+GTC 2015:
+  http://on-demand.gputechconf.com/gtc/2015/video/S5166.html
+  http://on-demand.gputechconf.com/gtc/2015/presentation/S5166-H-Carter-Edwards.pdf
+
+A programming guide can be found under doc/Kokkos_PG.pdf. This is an initial version
+and feedback is greatly appreciated.
+
+For questions please send an email to
+kokkos-users@software.sandia.gov
+
+For non-public questions send an email to
+hcedwar(at)sandia.gov and crtrott(at)sandia.gov
+
+============================================================================
+====Requirements============================================================
+============================================================================
+
+Primary tested compilers are:
+  GCC 4.7.2
+  GCC 5.1.0
+  Intel 14.0.1
+  Intel 15.0.1
+  Clang 3.7.0
+
+Secondary tested compilers are:
+  CUDA 6.5
+  CUDA 7.0
+
+Primary tested compiler are passing in release mode
+with warnings as errors. We are using the following set
+of flags:
+GCC:   -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits
+       -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
+Intel: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
+Clang: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
+
+
+============================================================================
+====Getting started=========================================================
+============================================================================
+
+In the 'example/tutorial' directory you will find step by step tutorial
+examples which explain many of the features of Kokkos. They work with
+simple Makefiles. To build with g++ and OpenMP simply type 'make openmp'
+in the 'example/tutorial' directory. This will build all examples in the
+subfolders.
+
+============================================================================
+====Running Unit Tests======================================================
+============================================================================
+
+To run the unit tests create a build directory and run the following commands
+
+KOKKOS_PATH/generate_makefile.bash
+make build-test
+make test
+
+Run KOKKOS_PATH/generate_makefile.bash --help for more detailed options such as
+changing the device type for which to build.
+
+============================================================================
+====Install the library=====================================================
+============================================================================
+
+To install Kokkos as a library create a build directory and run the following
+
+KOKKOS_PATH/generate_makefile.bash --prefix=INSTALL_PATH
+make lib
+make install
+
+KOKKOS_PATH/generate_makefile.bash --help for more detailed options such as
+changing the device type for which to build.
+
+============================================================================
+====CMakeFiles==============================================================
+============================================================================
+
+The CMake files contained in this repository require Tribits and are used
+for integration with Trilinos. They do not currently support a standalone
+CMake build.
+
+
diff --git a/lib/kokkos/algorithms/src/KokkosAlgorithms_dummy.cpp b/lib/kokkos/algorithms/src/KokkosAlgorithms_dummy.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..11763c2f10d317ab01940f1df8a32d3923a98fbf
--- /dev/null
+++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp
@@ -0,0 +1,1691 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_RANDOM_HPP
+#define KOKKOS_RANDOM_HPP
+
+#include <Kokkos_Core.hpp>
+#include <cstdio>
+#include <cstdlib>
+#include <cmath>
+
+/// \file Kokkos_Random.hpp
+/// \brief Pseudorandom number generators
+///
+/// These generators are based on Vigna, Sebastiano (2014). "An
+/// experimental exploration of Marsaglia's xorshift generators,
+/// scrambled."  See: http://arxiv.org/abs/1402.6246
+
+namespace Kokkos {
+
+  /*Template functions to get equidistributed random numbers from a generator for a specific Scalar type
+
+       template<class Generator,Scalar>
+       struct rand{
+
+         //Max value returned by draw(Generator& gen)
+         KOKKOS_INLINE_FUNCTION
+         static Scalar max();
+
+         //Returns a value between zero and max()
+         KOKKOS_INLINE_FUNCTION
+         static Scalar draw(Generator& gen);
+
+         //Returns a value between zero and range()
+         //Note: for floating point values range can be larger than max()
+         KOKKOS_INLINE_FUNCTION
+         static Scalar draw(Generator& gen, const Scalar& range){}
+
+         //Return value between start and end
+         KOKKOS_INLINE_FUNCTION
+         static Scalar draw(Generator& gen, const Scalar& start, const Scalar& end);
+      };
+
+    The Random number generators themselves have two components a state-pool and the actual generator
+    A state-pool manages a number of generators, so that each active thread is able to grep its own.
+    This allows the generation of random numbers which are independent between threads. Note that
+    in contrast to CuRand none of the functions of the pool (or the generator) are collectives,
+    i.e. all functions can be called inside conditionals.
+
+    template<class Device>
+    class Pool {
+     public:
+      //The Kokkos device type
+      typedef Device device_type;
+      //The actual generator type
+      typedef Generator<Device> generator_type;
+
+      //Default constructor: does not initialize a pool
+      Pool();
+
+      //Initializing constructor: calls init(seed,Device_Specific_Number);
+      Pool(unsigned int seed);
+
+      //Intialize Pool with seed as a starting seed with a pool_size of num_states
+      //The Random_XorShift64 generator is used in serial to initialize all states,
+      //thus the intialization process is platform independent and deterministic.
+      void init(unsigned int seed, int num_states);
+
+      //Get a generator. This will lock one of the states, guaranteeing that each thread
+      //will have its private generator. Note: on Cuda getting a state involves atomics,
+      //and is thus not deterministic!
+      generator_type get_state();
+
+      //Give a state back to the pool. This unlocks the state, and writes the modified
+      //state of the generator back to the pool.
+      void free_state(generator_type gen);
+
+    }
+
+    template<class Device>
+    class Generator {
+     public:
+     //The Kokkos device type
+    typedef DeviceType device_type;
+
+    //Max return values of respective [X]rand[S]() functions
+    enum {MAX_URAND = 0xffffffffU};
+    enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
+    enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
+    enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
+
+
+    //Init with a state and the idx with respect to pool. Note: in serial the
+    //Generator can be used by just giving it the necessary state arguments
+    KOKKOS_INLINE_FUNCTION
+    Generator (STATE_ARGUMENTS, int state_idx = 0);
+
+    //Draw a equidistributed uint32_t in the range (0,MAX_URAND]
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand();
+
+    //Draw a equidistributed uint64_t in the range (0,MAX_URAND64]
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64();
+
+    //Draw a equidistributed uint32_t in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& range);
+
+    //Draw a equidistributed uint32_t in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& start, const uint32_t& end );
+
+    //Draw a equidistributed uint64_t in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& range);
+
+    //Draw a equidistributed uint64_t in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end );
+
+    //Draw a equidistributed int in the range (0,MAX_RAND]
+    KOKKOS_INLINE_FUNCTION
+    int rand();
+
+    //Draw a equidistributed int in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range);
+
+    //Draw a equidistributed int in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end );
+
+    //Draw a equidistributed int64_t in the range (0,MAX_RAND64]
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64();
+
+    //Draw a equidistributed int64_t in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range);
+
+    //Draw a equidistributed int64_t in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& start, const int64_t& end );
+
+    //Draw a equidistributed float in the range (0,1.0]
+    KOKKOS_INLINE_FUNCTION
+    float frand();
+
+    //Draw a equidistributed float in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& range);
+
+    //Draw a equidistributed float in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& start, const float& end );
+
+    //Draw a equidistributed double in the range (0,1.0]
+    KOKKOS_INLINE_FUNCTION
+    double drand();
+
+    //Draw a equidistributed double in the range (0,range]
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& range);
+
+    //Draw a equidistributed double in the range (start,end]
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& start, const double& end );
+
+    //Draw a standard normal distributed double
+    KOKKOS_INLINE_FUNCTION
+    double normal() ;
+
+    //Draw a normal distributed double with given mean and standard deviation
+    KOKKOS_INLINE_FUNCTION
+    double normal(const double& mean, const double& std_dev=1.0);
+    }
+
+    //Additional Functions:
+
+    //Fills view with random numbers in the range (0,range]
+    template<class ViewType, class PoolType>
+    void fill_random(ViewType view, PoolType pool, ViewType::value_type range);
+
+    //Fills view with random numbers in the range (start,end]
+    template<class ViewType, class PoolType>
+    void fill_random(ViewType view, PoolType pool,
+                     ViewType::value_type start, ViewType::value_type end);
+
+*/
+
+  template<class Generator, class Scalar>
+  struct rand;
+
+
+  template<class Generator>
+  struct rand<Generator,char> {
+
+    KOKKOS_INLINE_FUNCTION
+    static short max(){return 127;}
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen)
+                          {return short((gen.rand()&0xff+256)%256);}
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen, const char& range)
+                          {return char(gen.rand(range));}
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen, const char& start, const char& end)
+                          {return char(gen.rand(start,end));}
+
+  };
+
+  template<class Generator>
+  struct rand<Generator,short> {
+    KOKKOS_INLINE_FUNCTION
+    static short max(){return 32767;}
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen)
+                          {return short((gen.rand()&0xffff+65536)%32768);}
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen, const short& range)
+                          {return short(gen.rand(range));}
+    KOKKOS_INLINE_FUNCTION
+    static short draw(Generator& gen, const short& start, const short& end)
+                          {return short(gen.rand(start,end));}
+
+  };
+
+  template<class Generator>
+  struct rand<Generator,int> {
+    KOKKOS_INLINE_FUNCTION
+    static int max(){return Generator::MAX_RAND;}
+    KOKKOS_INLINE_FUNCTION
+    static int draw(Generator& gen)
+                          {return gen.rand();}
+    KOKKOS_INLINE_FUNCTION
+    static int draw(Generator& gen, const int& range)
+                          {return gen.rand(range);}
+    KOKKOS_INLINE_FUNCTION
+    static int draw(Generator& gen, const int& start, const int& end)
+                          {return gen.rand(start,end);}
+
+  };
+
+  template<class Generator>
+  struct rand<Generator,unsigned int> {
+    KOKKOS_INLINE_FUNCTION
+    static unsigned int max () {
+      return Generator::MAX_URAND;
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned int draw (Generator& gen) {
+      return gen.urand ();
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned int draw(Generator& gen, const unsigned int& range) {
+      return gen.urand (range);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned int
+    draw (Generator& gen, const unsigned int& start, const unsigned int& end) {
+      return gen.urand (start, end);
+    }
+  };
+
+  template<class Generator>
+  struct rand<Generator,long> {
+    KOKKOS_INLINE_FUNCTION
+    static long max () {
+      // FIXME (mfh 26 Oct 2014) It would be better to select the
+      // return value at compile time, using something like enable_if.
+      return sizeof (long) == 4 ?
+        static_cast<long> (Generator::MAX_RAND) :
+        static_cast<long> (Generator::MAX_RAND64);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long draw (Generator& gen) {
+      // FIXME (mfh 26 Oct 2014) It would be better to select the
+      // return value at compile time, using something like enable_if.
+      return sizeof (long) == 4 ?
+        static_cast<long> (gen.rand ()) :
+        static_cast<long> (gen.rand64 ());
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long draw (Generator& gen, const long& range) {
+      // FIXME (mfh 26 Oct 2014) It would be better to select the
+      // return value at compile time, using something like enable_if.
+      return sizeof (long) == 4 ?
+        static_cast<long> (gen.rand (static_cast<int> (range))) :
+        static_cast<long> (gen.rand64 (range));
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long draw (Generator& gen, const long& start, const long& end) {
+      // FIXME (mfh 26 Oct 2014) It would be better to select the
+      // return value at compile time, using something like enable_if.
+      return sizeof (long) == 4 ?
+        static_cast<long> (gen.rand (static_cast<int> (start),
+                                     static_cast<int> (end))) :
+        static_cast<long> (gen.rand64 (start, end));
+    }
+  };
+
+  template<class Generator>
+  struct rand<Generator,unsigned long> {
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long max () {
+      // FIXME (mfh 26 Oct 2014) It would be better to select the
+      // return value at compile time, using something like enable_if.
+      return sizeof (unsigned long) == 4 ?
+        static_cast<unsigned long> (Generator::MAX_URAND) :
+        static_cast<unsigned long> (Generator::MAX_URAND64);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long draw (Generator& gen) {
+      // FIXME (mfh 26 Oct 2014) It would be better to select the
+      // return value at compile time, using something like enable_if.
+      return sizeof (unsigned long) == 4 ?
+        static_cast<unsigned long> (gen.urand ()) :
+        static_cast<unsigned long> (gen.urand64 ());
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long draw(Generator& gen, const unsigned long& range) {
+      // FIXME (mfh 26 Oct 2014) It would be better to select the
+      // return value at compile time, using something like enable_if.
+      return sizeof (unsigned long) == 4 ?
+        static_cast<unsigned long> (gen.urand (static_cast<unsigned int> (range))) :
+        static_cast<unsigned long> (gen.urand64 (range));
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long
+    draw (Generator& gen, const unsigned long& start, const unsigned long& end) {
+      // FIXME (mfh 26 Oct 2014) It would be better to select the
+      // return value at compile time, using something like enable_if.
+      return sizeof (unsigned long) == 4 ?
+        static_cast<unsigned long> (gen.urand (static_cast<unsigned int> (start),
+                                               static_cast<unsigned int> (end))) :
+        static_cast<unsigned long> (gen.urand64 (start, end));
+    }
+  };
+
+  // NOTE (mfh 26 oct 2014) This is a partial specialization for long
+  // long, a C99 / C++11 signed type which is guaranteed to be at
+  // least 64 bits.  Do NOT write a partial specialization for
+  // int64_t!!!  This is just a typedef!  It could be either long or
+  // long long.  We don't know which a priori, and I've seen both.
+  // The types long and long long are guaranteed to differ, so it's
+  // always safe to specialize for both.
+  template<class Generator>
+  struct rand<Generator, long long> {
+    KOKKOS_INLINE_FUNCTION
+    static long long max () {
+      // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
+      return Generator::MAX_RAND64;
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long long draw (Generator& gen) {
+      // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
+      return gen.rand64 ();
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long long draw (Generator& gen, const long long& range) {
+      // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
+      return gen.rand64 (range);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static long long draw (Generator& gen, const long long& start, const long long& end) {
+      // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
+      return gen.rand64 (start, end);
+    }
+  };
+
+  // NOTE (mfh 26 oct 2014) This is a partial specialization for
+  // unsigned long long, a C99 / C++11 unsigned type which is
+  // guaranteed to be at least 64 bits.  Do NOT write a partial
+  // specialization for uint64_t!!!  This is just a typedef!  It could
+  // be either unsigned long or unsigned long long.  We don't know
+  // which a priori, and I've seen both.  The types unsigned long and
+  // unsigned long long are guaranteed to differ, so it's always safe
+  // to specialize for both.
+  template<class Generator>
+  struct rand<Generator,unsigned long long> {
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long long max () {
+      // FIXME (mfh 26 Oct 2014) It's legal for unsigned long long to be > 64 bits.
+      return Generator::MAX_URAND64;
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long long draw (Generator& gen) {
+      // FIXME (mfh 26 Oct 2014) It's legal for unsigned long long to be > 64 bits.
+      return gen.urand64 ();
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long long draw (Generator& gen, const unsigned long long& range) {
+      // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
+      return gen.urand64 (range);
+    }
+    KOKKOS_INLINE_FUNCTION
+    static unsigned long long
+    draw (Generator& gen, const unsigned long long& start, const unsigned long long& end) {
+      // FIXME (mfh 26 Oct 2014) It's legal for long long to be > 64 bits.
+      return gen.urand64 (start, end);
+    }
+  };
+
+  template<class Generator>
+  struct rand<Generator,float> {
+    KOKKOS_INLINE_FUNCTION
+    static float max(){return 1.0f;}
+    KOKKOS_INLINE_FUNCTION
+    static float draw(Generator& gen)
+                          {return gen.frand();}
+    KOKKOS_INLINE_FUNCTION
+    static float draw(Generator& gen, const float& range)
+                          {return gen.frand(range);}
+    KOKKOS_INLINE_FUNCTION
+    static float draw(Generator& gen, const float& start, const float& end)
+                          {return gen.frand(start,end);}
+
+  };
+
+  template<class Generator>
+  struct rand<Generator,double> {
+    KOKKOS_INLINE_FUNCTION
+    static double max(){return 1.0;}
+    KOKKOS_INLINE_FUNCTION
+    static double draw(Generator& gen)
+                          {return gen.drand();}
+    KOKKOS_INLINE_FUNCTION
+    static double draw(Generator& gen, const double& range)
+                          {return gen.drand(range);}
+    KOKKOS_INLINE_FUNCTION
+    static double draw(Generator& gen, const double& start, const double& end)
+                          {return gen.drand(start,end);}
+
+  };
+
+  template<class DeviceType>
+  class Random_XorShift64_Pool;
+
+  template<class DeviceType>
+  class Random_XorShift64 {
+  private:
+    uint64_t state_;
+    const int state_idx_;
+    friend class Random_XorShift64_Pool<DeviceType>;
+  public:
+
+    typedef DeviceType device_type;
+
+    enum {MAX_URAND = 0xffffffffU};
+    enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
+    enum {MAX_RAND = static_cast<int>(0xffffffff/2)};
+    enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffLL/2-1)};
+
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift64 (uint64_t state, int state_idx = 0)
+     : state_(state),state_idx_(state_idx){}
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand() {
+      state_ ^= state_ >> 12;
+      state_ ^= state_ << 25;
+      state_ ^= state_ >> 27;
+
+      uint64_t tmp = state_ * 2685821657736338717ULL;
+      tmp = tmp>>16;
+      return static_cast<uint32_t>(tmp&MAX_URAND);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64() {
+      state_ ^= state_ >> 12;
+      state_ ^= state_ << 25;
+      state_ ^= state_ >> 27;
+      return (state_ * 2685821657736338717ULL) - 1;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& range) {
+      const uint32_t max_val = (MAX_URAND/range)*range;
+      uint32_t tmp = urand();
+      while(tmp>=max_val)
+        tmp = urand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& start, const uint32_t& end ) {
+      return urand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& range) {
+      const uint64_t max_val = (MAX_URAND64/range)*range;
+      uint64_t tmp = urand64();
+      while(tmp>=max_val)
+        tmp = urand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
+      return urand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand() {
+      return static_cast<int>(urand()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range) {
+      const int max_val = (MAX_RAND/range)*range;
+      int tmp = rand();
+      while(tmp>=max_val)
+        tmp = rand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end ) {
+      return rand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64() {
+      return static_cast<int64_t>(urand64()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range) {
+      const int64_t max_val = (MAX_RAND64/range)*range;
+      int64_t tmp = rand64();
+      while(tmp>=max_val)
+        tmp = rand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& start, const int64_t& end ) {
+      return rand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand() {
+      return 1.0f * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& start, const float& end ) {
+      return frand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand() {
+      return 1.0 * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& start, const double& end ) {
+      return drand(end-start)+start;
+    }
+
+    //Marsaglia polar method for drawing a standard normal distributed random number
+    KOKKOS_INLINE_FUNCTION
+    double normal() {
+      double S = 2.0;
+      double U;
+      while(S>=1.0) {
+        U = drand();
+        const double V = drand();
+        S = U*U+V*V;
+      }
+      return U*sqrt(-2.0*log(S)/S);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double normal(const double& mean, const double& std_dev=1.0) {
+      return mean + normal()*std_dev;
+    }
+
+  };
+
+  template<class DeviceType = Kokkos::DefaultExecutionSpace>
+  class Random_XorShift64_Pool {
+  private:
+    typedef View<int*,DeviceType> lock_type;
+    typedef View<uint64_t*,DeviceType> state_data_type;
+    lock_type locks_;
+    state_data_type state_;
+    int num_states_;
+
+  public:
+    typedef Random_XorShift64<DeviceType> generator_type;
+    typedef DeviceType device_type;
+
+    Random_XorShift64_Pool() {
+      num_states_ = 0;
+    }
+    Random_XorShift64_Pool(uint64_t seed) {
+      num_states_ = 0;
+      init(seed,DeviceType::max_hardware_threads());
+    }
+
+    Random_XorShift64_Pool(const Random_XorShift64_Pool& src):
+      locks_(src.locks_),
+      state_(src.state_),
+      num_states_(src.num_states_)
+    {}
+
+    Random_XorShift64_Pool operator = (const Random_XorShift64_Pool& src) {
+      locks_ = src.locks_;
+      state_ = src.state_;
+      num_states_ = src.num_states_;
+      return *this;
+    }
+
+    void init(uint64_t seed, int num_states) {
+      num_states_ = num_states;
+
+      locks_ = lock_type("Kokkos::Random_XorShift64::locks",num_states_);
+      state_ = state_data_type("Kokkos::Random_XorShift64::state",num_states_);
+
+      typename state_data_type::HostMirror h_state = create_mirror_view(state_);
+      typename lock_type::HostMirror h_lock = create_mirror_view(locks_);
+
+      // Execute on the HostMirror's default execution space.
+      Random_XorShift64<typename state_data_type::HostMirror::execution_space> gen(seed,0);
+      for(int i = 0; i < 17; i++)
+        gen.rand();
+      for(int i = 0; i < num_states_; i++) {
+        int n1 = gen.rand();
+        int n2 = gen.rand();
+        int n3 = gen.rand();
+        int n4 = gen.rand();
+        h_state(i) = (((static_cast<uint64_t>(n1)) & 0xffff)<<00) |
+                     (((static_cast<uint64_t>(n2)) & 0xffff)<<16) |
+                     (((static_cast<uint64_t>(n3)) & 0xffff)<<32) |
+                     (((static_cast<uint64_t>(n4)) & 0xffff)<<48);
+        h_lock(i) = 0;
+      }
+      deep_copy(state_,h_state);
+      deep_copy(locks_,h_lock);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift64<DeviceType> get_state() const {
+      const int i = DeviceType::hardware_thread_id();;
+      return Random_XorShift64<DeviceType>(state_(i),i);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void free_state(const Random_XorShift64<DeviceType>& state) const {
+      state_(state.state_idx_) = state.state_;
+    }
+  };
+
+
+  template<class DeviceType>
+  class Random_XorShift1024_Pool;
+
+  template<class DeviceType>
+  class Random_XorShift1024 {
+  private:
+    int p_;
+    const int state_idx_;
+    uint64_t state_[16];
+    friend class Random_XorShift1024_Pool<DeviceType>;
+  public:
+
+    typedef DeviceType device_type;
+
+    enum {MAX_URAND = 0xffffffffU};
+    enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
+    enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
+    enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
+
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0):
+      p_(p),state_idx_(state_idx){
+      for(int i=0 ; i<16; i++)
+        state_[i] = state[i];
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand() {
+      uint64_t state_0 = state_[ p_ ];
+      uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      uint64_t tmp = ( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
+      tmp = tmp>>16;
+      return static_cast<uint32_t>(tmp&MAX_URAND);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64() {
+      uint64_t state_0 = state_[ p_ ];
+      uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      return (( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& range) {
+      const uint32_t max_val = (MAX_URAND/range)*range;
+      uint32_t tmp = urand();
+      while(tmp>=max_val)
+        tmp = urand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& start, const uint32_t& end ) {
+      return urand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& range) {
+      const uint64_t max_val = (MAX_URAND64/range)*range;
+      uint64_t tmp = urand64();
+      while(tmp>=max_val)
+        tmp = urand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
+      return urand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand() {
+      return static_cast<int>(urand()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range) {
+      const int max_val = (MAX_RAND/range)*range;
+      int tmp = rand();
+      while(tmp>=max_val)
+        tmp = rand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end ) {
+      return rand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64() {
+      return static_cast<int64_t>(urand64()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range) {
+      const int64_t max_val = (MAX_RAND64/range)*range;
+      int64_t tmp = rand64();
+      while(tmp>=max_val)
+        tmp = rand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& start, const int64_t& end ) {
+      return rand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand() {
+      return 1.0f * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& start, const float& end ) {
+      return frand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand() {
+      return 1.0 * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& start, const double& end ) {
+      return frand(end-start)+start;
+    }
+
+    //Marsaglia polar method for drawing a standard normal distributed random number
+    KOKKOS_INLINE_FUNCTION
+    double normal() {
+      double S = 2.0;
+      double U;
+      while(S>=1.0) {
+        U = drand();
+        const double V = drand();
+        S = U*U+V*V;
+      }
+      return U*sqrt(-2.0*log(S)/S);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double normal(const double& mean, const double& std_dev=1.0) {
+      return mean + normal()*std_dev;
+    }
+  };
+
+
+  template<class DeviceType = Kokkos::DefaultExecutionSpace>
+  class Random_XorShift1024_Pool {
+  private:
+    typedef View<int*,DeviceType> int_view_type;
+    typedef View<uint64_t*[16],DeviceType> state_data_type;
+
+    int_view_type locks_;
+    state_data_type state_;
+    int_view_type p_;
+    int num_states_;
+
+  public:
+    typedef Random_XorShift1024<DeviceType> generator_type;
+
+    typedef DeviceType device_type;
+
+    Random_XorShift1024_Pool() {
+      num_states_ = 0;
+    }
+
+    inline
+    Random_XorShift1024_Pool(uint64_t seed){
+      num_states_ = 0;
+      init(seed,DeviceType::max_hardware_threads());
+    }
+
+    Random_XorShift1024_Pool(const Random_XorShift1024_Pool& src):
+      locks_(src.locks_),
+      state_(src.state_),
+      p_(src.p_),
+      num_states_(src.num_states_)
+    {}
+
+    Random_XorShift1024_Pool operator = (const Random_XorShift1024_Pool& src) {
+      locks_ = src.locks_;
+      state_ = src.state_;
+      p_ = src.p_;
+      num_states_ = src.num_states_;
+      return *this;
+    }
+
+    inline
+    void init(uint64_t seed, int num_states) {
+      num_states_ = num_states;
+
+      locks_ = int_view_type("Kokkos::Random_XorShift1024::locks",num_states_);
+      state_ = state_data_type("Kokkos::Random_XorShift1024::state",num_states_);
+      p_ = int_view_type("Kokkos::Random_XorShift1024::p",num_states_);
+
+      typename state_data_type::HostMirror h_state = create_mirror_view(state_);
+      typename int_view_type::HostMirror h_lock = create_mirror_view(locks_);
+      typename int_view_type::HostMirror h_p = create_mirror_view(p_);
+
+      // Execute on the HostMirror's default execution space.
+      Random_XorShift64<typename state_data_type::HostMirror::execution_space> gen(seed,0);
+      for(int i = 0; i < 17; i++)
+        gen.rand();
+      for(int i = 0; i < num_states_; i++) {
+        for(int j = 0; j < 16 ; j++) {
+          int n1 = gen.rand();
+          int n2 = gen.rand();
+          int n3 = gen.rand();
+          int n4 = gen.rand();
+          h_state(i,j) = (((static_cast<uint64_t>(n1)) & 0xffff)<<00) |
+                         (((static_cast<uint64_t>(n2)) & 0xffff)<<16) |
+                         (((static_cast<uint64_t>(n3)) & 0xffff)<<32) |
+                         (((static_cast<uint64_t>(n4)) & 0xffff)<<48);
+        }
+        h_p(i) = 0;
+        h_lock(i) = 0;
+      }
+      deep_copy(state_,h_state);
+      deep_copy(locks_,h_lock);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift1024<DeviceType> get_state() const {
+      const int i = DeviceType::hardware_thread_id();
+      return Random_XorShift1024<DeviceType>(&state_(i,0),p_(i),i);
+    };
+
+    KOKKOS_INLINE_FUNCTION
+    void free_state(const Random_XorShift1024<DeviceType>& state) const {
+      for(int i = 0; i<16; i++)
+        state_(state.state_idx_,i) = state.state_[i];
+      p_(state.state_idx_) = state.p_;
+    }
+  };
+
+#if defined(KOKKOS_HAVE_CUDA) && defined(__CUDACC__)
+
+  template<>
+  class Random_XorShift1024<Kokkos::Cuda> {
+  private:
+    int p_;
+    const int state_idx_;
+    uint64_t* state_;
+    friend class Random_XorShift1024_Pool<Kokkos::Cuda>;
+  public:
+
+    typedef Kokkos::Cuda device_type;
+
+    enum {MAX_URAND = 0xffffffffU};
+    enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
+    enum {MAX_RAND = static_cast<int>(0xffffffffU/2)};
+    enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
+
+    KOKKOS_INLINE_FUNCTION
+    Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0):
+      p_(p),state_idx_(state_idx),state_(state){
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand() {
+      uint64_t state_0 = state_[ p_ ];
+      uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      uint64_t tmp = ( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
+      tmp = tmp>>16;
+      return static_cast<uint32_t>(tmp&MAX_URAND);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64() {
+      uint64_t state_0 = state_[ p_ ];
+      uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
+      state_1 ^= state_1 << 31;
+      state_1 ^= state_1 >> 11;
+      state_0 ^= state_0 >> 30;
+      return (( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& range) {
+      const uint32_t max_val = (MAX_URAND/range)*range;
+      uint32_t tmp = urand();
+      while(tmp>=max_val)
+        urand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint32_t urand(const uint32_t& start, const uint32_t& end ) {
+      return urand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& range) {
+      const uint64_t max_val = (MAX_URAND64/range)*range;
+      uint64_t tmp = urand64();
+      while(tmp>=max_val)
+        urand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    uint64_t urand64(const uint64_t& start, const uint64_t& end ) {
+      return urand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand() {
+      return static_cast<int>(urand()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& range) {
+      const int max_val = (MAX_RAND/range)*range;
+      int tmp = rand();
+      while(tmp>=max_val)
+        rand();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int rand(const int& start, const int& end ) {
+      return rand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64() {
+      return static_cast<int64_t>(urand64()/2);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& range) {
+      const int64_t max_val = (MAX_RAND64/range)*range;
+      int64_t tmp = rand64();
+      while(tmp>=max_val)
+        rand64();
+      return tmp%range;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int64_t rand64(const int64_t& start, const int64_t& end ) {
+      return rand64(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand() {
+      return 1.0f * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float frand(const float& start, const float& end ) {
+      return frand(end-start)+start;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand() {
+      return 1.0 * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& range) {
+      return range * urand64()/MAX_URAND64;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double drand(const double& start, const double& end ) {
+      return frand(end-start)+start;
+    }
+
+    //Marsaglia polar method for drawing a standard normal distributed random number
+    KOKKOS_INLINE_FUNCTION
+    double normal() {
+      double S = 2.0;
+      double U;
+      while(S>=1.0) {
+        U = drand();
+        const double V = drand();
+        S = U*U+V*V;
+      }
+      return U*sqrt(-2.0*log(S)/S);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double normal(const double& mean, const double& std_dev=1.0) {
+      return mean + normal()*std_dev;
+    }
+  };
+
+template<>
+inline
+Random_XorShift64_Pool<Kokkos::Cuda>::Random_XorShift64_Pool(uint64_t seed) {
+  num_states_ = 0;
+  init(seed,4*32768);
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+Random_XorShift64<Kokkos::Cuda> Random_XorShift64_Pool<Kokkos::Cuda>::get_state() const {
+#ifdef __CUDA_ARCH__
+  const int i_offset = (threadIdx.x*blockDim.y + threadIdx.y)*blockDim.z+threadIdx.z;
+  int i = ((blockIdx.x*gridDim.y+blockIdx.y)*gridDim.z + blockIdx.z) *
+           blockDim.x*blockDim.y*blockDim.z + i_offset;
+  while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
+      i+=blockDim.x*blockDim.y*blockDim.z;
+      if(i>=num_states_) {i = i_offset;}
+  }
+
+  return Random_XorShift64<Kokkos::Cuda>(state_(i),i);
+#else
+  return Random_XorShift64<Kokkos::Cuda>(state_(0),0);
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void Random_XorShift64_Pool<Kokkos::Cuda>::free_state(const Random_XorShift64<Kokkos::Cuda> &state) const {
+#ifdef __CUDA_ARCH__
+  state_(state.state_idx_) = state.state_;
+  locks_(state.state_idx_) = 0;
+  return;
+#endif
+}
+
+
+template<>
+inline
+Random_XorShift1024_Pool<Kokkos::Cuda>::Random_XorShift1024_Pool(uint64_t seed) {
+  num_states_ = 0;
+  init(seed,4*32768);
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+Random_XorShift1024<Kokkos::Cuda> Random_XorShift1024_Pool<Kokkos::Cuda>::get_state() const {
+#ifdef __CUDA_ARCH__
+  const int i_offset = (threadIdx.x*blockDim.y + threadIdx.y)*blockDim.z+threadIdx.z;
+  int i = ((blockIdx.x*gridDim.y+blockIdx.y)*gridDim.z + blockIdx.z) *
+           blockDim.x*blockDim.y*blockDim.z + i_offset;
+  while(Kokkos::atomic_compare_exchange(&locks_(i),0,1)) {
+      i+=blockDim.x*blockDim.y*blockDim.z;
+      if(i>=num_states_) {i = i_offset;}
+  }
+
+  return Random_XorShift1024<Kokkos::Cuda>(&state_(i,0), p_(i), i);
+#else
+  return Random_XorShift1024<Kokkos::Cuda>(&state_(0,0), p_(0), 0);
+#endif
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift1024<Kokkos::Cuda> &state) const {
+#ifdef __CUDA_ARCH__
+  for(int i=0; i<16; i++)
+    state_(state.state_idx_,i) = state.state_[i];
+  locks_(state.state_idx_) = 0;
+  return;
+#endif
+}
+
+
+#endif
+
+
+
+template<class ViewType, class RandomPool, int loops, int rank>
+struct fill_random_functor_range;
+template<class ViewType, class RandomPool, int loops, int rank>
+struct fill_random_functor_begin_end;
+
+template<class ViewType, class RandomPool, int loops>
+struct fill_random_functor_range<ViewType,RandomPool,loops,1>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (unsigned int i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(unsigned int j=0;j<loops;j++) {
+      const uint64_t idx = i*loops+j;
+      if(idx<a.dimension_0())
+        a(idx) = Rand::draw(gen,range);
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops>
+struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (unsigned int i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(unsigned int j=0;j<loops;j++) {
+      const uint64_t idx = i*loops+j;
+      if(idx<a.dimension_0()) {
+        for(unsigned int k=0;k<a.dimension_1();k++)
+          a(idx,k) = Rand::draw(gen,range);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+
+template<class ViewType, class RandomPool, int loops>
+struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (unsigned int i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(unsigned int j=0;j<loops;j++) {
+      const uint64_t idx = i*loops+j;
+      if(idx<a.dimension_0()) {
+        for(unsigned int k=0;k<a.dimension_1();k++)
+          for(unsigned int l=0;l<a.dimension_2();l++)
+            a(idx,k,l) = Rand::draw(gen,range);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops>
+struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (unsigned int i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(unsigned int j=0;j<loops;j++) {
+      const uint64_t idx = i*loops+j;
+      if(idx<a.dimension_0()) {
+        for(unsigned int k=0;k<a.dimension_1();k++)
+          for(unsigned int l=0;l<a.dimension_2();l++)
+            for(unsigned int m=0;m<a.dimension_3();m++)
+              a(idx,k,l,m) = Rand::draw(gen,range);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops>
+struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (unsigned int i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(unsigned int j=0;j<loops;j++) {
+      const uint64_t idx = i*loops+j;
+      if(idx<a.dimension_0()) {
+        for(unsigned int k=0;k<a.dimension_1();k++)
+          for(unsigned int l=0;l<a.dimension_2();l++)
+            for(unsigned int m=0;m<a.dimension_3();m++)
+              for(unsigned int n=0;n<a.dimension_4();n++)
+              a(idx,k,l,m,n) = Rand::draw(gen,range);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops>
+struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (unsigned int i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(unsigned int j=0;j<loops;j++) {
+      const uint64_t idx = i*loops+j;
+      if(idx<a.dimension_0()) {
+        for(unsigned int k=0;k<a.dimension_1();k++)
+          for(unsigned int l=0;l<a.dimension_2();l++)
+            for(unsigned int m=0;m<a.dimension_3();m++)
+              for(unsigned int n=0;n<a.dimension_4();n++)
+                for(unsigned int o=0;o<a.dimension_5();o++)
+              a(idx,k,l,m,n,o) = Rand::draw(gen,range);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops>
+struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (unsigned int i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(unsigned int j=0;j<loops;j++) {
+      const uint64_t idx = i*loops+j;
+      if(idx<a.dimension_0()) {
+        for(unsigned int k=0;k<a.dimension_1();k++)
+          for(unsigned int l=0;l<a.dimension_2();l++)
+            for(unsigned int m=0;m<a.dimension_3();m++)
+              for(unsigned int n=0;n<a.dimension_4();n++)
+                for(unsigned int o=0;o<a.dimension_5();o++)
+                  for(unsigned int p=0;p<a.dimension_6();p++)
+              a(idx,k,l,m,n,o,p) = Rand::draw(gen,range);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops>
+struct fill_random_functor_range<ViewType,RandomPool,loops,8>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type range;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_range(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type range_):
+    a(a_),rand_pool(rand_pool_),range(range_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (unsigned int i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(unsigned int j=0;j<loops;j++) {
+      const uint64_t idx = i*loops+j;
+      if(idx<a.dimension_0()) {
+        for(unsigned int k=0;k<a.dimension_1();k++)
+          for(unsigned int l=0;l<a.dimension_2();l++)
+            for(unsigned int m=0;m<a.dimension_3();m++)
+              for(unsigned int n=0;n<a.dimension_4();n++)
+                for(unsigned int o=0;o<a.dimension_5();o++)
+                  for(unsigned int p=0;p<a.dimension_6();p++)
+                    for(unsigned int q=0;q<a.dimension_7();q++)
+              a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,range);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+template<class ViewType, class RandomPool, int loops>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_):
+    a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (unsigned int i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(unsigned int j=0;j<loops;j++) {
+      const uint64_t idx = i*loops+j;
+      if(idx<a.dimension_0())
+        a(idx) = Rand::draw(gen,begin,end);
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_):
+    a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (unsigned int i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(unsigned int j=0;j<loops;j++) {
+      const uint64_t idx = i*loops+j;
+      if(idx<a.dimension_0()) {
+        for(unsigned int k=0;k<a.dimension_1();k++)
+          a(idx,k) = Rand::draw(gen,begin,end);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+
+template<class ViewType, class RandomPool, int loops>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_):
+    a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (unsigned int i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(unsigned int j=0;j<loops;j++) {
+      const uint64_t idx = i*loops+j;
+      if(idx<a.dimension_0()) {
+        for(unsigned int k=0;k<a.dimension_1();k++)
+          for(unsigned int l=0;l<a.dimension_2();l++)
+            a(idx,k,l) = Rand::draw(gen,begin,end);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_):
+    a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (unsigned int i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(unsigned int j=0;j<loops;j++) {
+      const uint64_t idx = i*loops+j;
+      if(idx<a.dimension_0()) {
+        for(unsigned int k=0;k<a.dimension_1();k++)
+          for(unsigned int l=0;l<a.dimension_2();l++)
+            for(unsigned int m=0;m<a.dimension_3();m++)
+              a(idx,k,l,m) = Rand::draw(gen,begin,end);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_):
+    a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (unsigned int i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(unsigned int j=0;j<loops;j++) {
+      const uint64_t idx = i*loops+j;
+      if(idx<a.dimension_0()){
+        for(unsigned int l=0;l<a.dimension_1();l++)
+          for(unsigned int m=0;m<a.dimension_2();m++)
+            for(unsigned int n=0;n<a.dimension_3();n++)
+              for(unsigned int o=0;o<a.dimension_4();o++)
+          a(idx,l,m,n,o) = Rand::draw(gen,begin,end);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_):
+    a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (unsigned int i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(unsigned int j=0;j<loops;j++) {
+      const uint64_t idx = i*loops+j;
+      if(idx<a.dimension_0()) {
+        for(unsigned int k=0;k<a.dimension_1();k++)
+          for(unsigned int l=0;l<a.dimension_2();l++)
+            for(unsigned int m=0;m<a.dimension_3();m++)
+              for(unsigned int n=0;n<a.dimension_4();n++)
+                for(unsigned int o=0;o<a.dimension_5();o++)
+          a(idx,k,l,m,n,o) = Rand::draw(gen,begin,end);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+
+template<class ViewType, class RandomPool, int loops>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_):
+    a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (unsigned int i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(unsigned int j=0;j<loops;j++) {
+      const uint64_t idx = i*loops+j;
+      if(idx<a.dimension_0()) {
+        for(unsigned int k=0;k<a.dimension_1();k++)
+          for(unsigned int l=0;l<a.dimension_2();l++)
+            for(unsigned int m=0;m<a.dimension_3();m++)
+              for(unsigned int n=0;n<a.dimension_4();n++)
+                for(unsigned int o=0;o<a.dimension_5();o++)
+                  for(unsigned int p=0;p<a.dimension_6();p++)
+            a(idx,k,l,m,n,o,p) = Rand::draw(gen,begin,end);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool, int loops>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
+  typedef typename ViewType::execution_space execution_space;
+  ViewType a;
+  RandomPool rand_pool;
+  typename ViewType::const_value_type begin,end;
+
+  typedef rand<typename RandomPool::generator_type, typename ViewType::non_const_value_type> Rand;
+
+  fill_random_functor_begin_end(ViewType a_, RandomPool rand_pool_,
+      typename ViewType::const_value_type begin_, typename ViewType::const_value_type end_):
+    a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (unsigned int i) const {
+    typename RandomPool::generator_type gen = rand_pool.get_state();
+    for(unsigned int j=0;j<loops;j++) {
+      const uint64_t idx = i*loops+j;
+      if(idx<a.dimension_0()) {
+        for(unsigned int k=0;k<a.dimension_1();k++)
+          for(unsigned int l=0;l<a.dimension_2();l++)
+            for(unsigned int m=0;m<a.dimension_3();m++)
+              for(unsigned int n=0;n<a.dimension_4();n++)
+                for(unsigned int o=0;o<a.dimension_5();o++)
+                  for(unsigned int p=0;p<a.dimension_6();p++)
+                    for(unsigned int q=0;q<a.dimension_7();q++)
+              a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,begin,end);
+      }
+    }
+    rand_pool.free_state(gen);
+  }
+};
+
+template<class ViewType, class RandomPool>
+void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type range) {
+  int64_t LDA = a.dimension_0();
+  if(LDA>0)
+    parallel_for((LDA+127)/128,fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank>(a,g,range));
+}
+
+template<class ViewType, class RandomPool>
+void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type begin,typename ViewType::const_value_type end ) {
+  int64_t LDA = a.dimension_0();
+  if(LDA>0)
+    parallel_for((LDA+127)/128,fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank>(a,g,begin,end));
+}
+}
+
+#endif
diff --git a/lib/kokkos/algorithms/src/Kokkos_Sort.hpp b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..8d97472aa9f0838d6d0a740a7717f21015d35639
--- /dev/null
+++ b/lib/kokkos/algorithms/src/Kokkos_Sort.hpp
@@ -0,0 +1,496 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+
+#ifndef KOKKOS_SORT_HPP_
+#define KOKKOS_SORT_HPP_
+
+#include <Kokkos_Core.hpp>
+
+#include <algorithm>
+
+namespace Kokkos {
+
+  namespace SortImpl {
+
+  template<class ValuesViewType, int Rank=ValuesViewType::Rank>
+  struct CopyOp;
+
+  template<class ValuesViewType>
+  struct CopyOp<ValuesViewType,1> {
+    template<class DstType, class SrcType>
+    KOKKOS_INLINE_FUNCTION
+    static void copy(DstType& dst, size_t i_dst,
+                     SrcType& src, size_t i_src ) {
+      dst(i_dst) = src(i_src);
+    }
+  };
+
+  template<class ValuesViewType>
+  struct CopyOp<ValuesViewType,2> {
+    template<class DstType, class SrcType>
+    KOKKOS_INLINE_FUNCTION
+    static void copy(DstType& dst, size_t i_dst,
+                     SrcType& src, size_t i_src ) {
+      for(int j = 0;j< (int) dst.dimension_1(); j++)
+        dst(i_dst,j) = src(i_src,j);
+    }
+  };
+
+  template<class ValuesViewType>
+  struct CopyOp<ValuesViewType,3> {
+    template<class DstType, class SrcType>
+    KOKKOS_INLINE_FUNCTION
+    static void copy(DstType& dst, size_t i_dst,
+                     SrcType& src, size_t i_src ) {
+      for(int j = 0; j<dst.dimension_1(); j++)
+        for(int k = 0; k<dst.dimension_2(); k++)
+          dst(i_dst,j,k) = src(i_src,j,k);
+    }
+  };
+  }
+
+template<class KeyViewType, class BinSortOp, class ExecutionSpace = typename KeyViewType::execution_space,
+         class SizeType = typename KeyViewType::memory_space::size_type>
+class BinSort {
+
+
+public:
+  template<class ValuesViewType, class PermuteViewType, class CopyOp>
+  struct bin_sort_sort_functor {
+    typedef ExecutionSpace execution_space;
+    typedef typename ValuesViewType::non_const_type values_view_type;
+    typedef typename ValuesViewType::const_type const_values_view_type;
+    Kokkos::View<typename values_view_type::const_data_type,typename values_view_type::array_layout,
+                 typename values_view_type::memory_space,Kokkos::MemoryTraits<Kokkos::RandomAccess> > values;
+    values_view_type sorted_values;
+    typename PermuteViewType::const_type sort_order;
+    bin_sort_sort_functor(const_values_view_type values_, values_view_type  sorted_values_, PermuteViewType sort_order_):
+       values(values_),sorted_values(sorted_values_),sort_order(sort_order_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int& i)  const {
+      //printf("Sort: %i %i\n",i,sort_order(i));
+      CopyOp::copy(sorted_values,i,values,sort_order(i));
+    }
+  };
+
+  typedef ExecutionSpace execution_space;
+  typedef BinSortOp bin_op_type;
+
+  struct bin_count_tag {};
+  struct bin_offset_tag {};
+  struct bin_binning_tag {};
+  struct bin_sort_bins_tag {};
+
+public:
+  typedef SizeType size_type;
+  typedef size_type value_type;
+
+  typedef Kokkos::View<size_type*, execution_space> offset_type;
+  typedef Kokkos::View<const int*, execution_space> bin_count_type;
+
+
+  typedef Kokkos::View<typename KeyViewType::const_data_type,
+                       typename KeyViewType::array_layout,
+                       typename KeyViewType::memory_space> const_key_view_type;
+  typedef Kokkos::View<typename KeyViewType::const_data_type,
+                       typename KeyViewType::array_layout,
+                       typename KeyViewType::memory_space,
+                       Kokkos::MemoryTraits<Kokkos::RandomAccess> > const_rnd_key_view_type;
+
+  typedef typename KeyViewType::non_const_value_type non_const_key_scalar;
+  typedef typename KeyViewType::const_value_type     const_key_scalar;
+
+private:
+  const_key_view_type keys;
+  const_rnd_key_view_type keys_rnd;
+
+public:
+  BinSortOp bin_op;
+
+  offset_type bin_offsets;
+
+  Kokkos::View<int*, ExecutionSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > bin_count_atomic;
+  bin_count_type bin_count_const;
+
+  offset_type sort_order;
+
+  bool sort_within_bins;
+
+public:
+
+  // Constructor: takes the keys, the binning_operator and optionally whether to sort within bins (default false)
+  BinSort(const_key_view_type keys_, BinSortOp bin_op_,
+          bool sort_within_bins_ = false)
+     :keys(keys_),keys_rnd(keys_), bin_op(bin_op_) {
+
+    bin_count_atomic = Kokkos::View<int*, ExecutionSpace >("Kokkos::SortImpl::BinSortFunctor::bin_count",bin_op.max_bins());
+    bin_count_const =  bin_count_atomic;
+    bin_offsets =      offset_type("Kokkos::SortImpl::BinSortFunctor::bin_offsets",bin_op.max_bins());
+    sort_order =       offset_type("PermutationVector",keys.dimension_0());
+    sort_within_bins = sort_within_bins_;
+  }
+
+  // Create the permutation vector, the bin_offset array and the bin_count array. Can be called again if keys changed
+  void create_permute_vector() {
+    Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_count_tag>    (0,keys.dimension_0()),*this);
+    Kokkos::parallel_scan(Kokkos::RangePolicy<ExecutionSpace,bin_offset_tag>   (0,bin_op.max_bins()) ,*this);
+
+    Kokkos::deep_copy(bin_count_atomic,0);
+    Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_binning_tag>  (0,keys.dimension_0()),*this);
+
+    if(sort_within_bins)
+      Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_sort_bins_tag>(0,bin_op.max_bins()) ,*this);
+  }
+
+  // Sort a view with respect ot the first dimension using the permutation array
+  template<class ValuesViewType>
+  void sort(ValuesViewType values) {
+    ValuesViewType sorted_values = ValuesViewType("Copy",
+           values.dimension_0(),
+           values.dimension_1(),
+           values.dimension_2(),
+           values.dimension_3(),
+           values.dimension_4(),
+           values.dimension_5(),
+           values.dimension_6(),
+           values.dimension_7());
+
+    parallel_for(values.dimension_0(),
+        bin_sort_sort_functor<ValuesViewType, offset_type,
+                              SortImpl::CopyOp<ValuesViewType> >(values,sorted_values,sort_order));
+
+    deep_copy(values,sorted_values);
+  }
+
+  // Get the permutation vector
+  KOKKOS_INLINE_FUNCTION
+  offset_type get_permute_vector() const { return sort_order;}
+
+  // Get the start offsets for each bin
+  KOKKOS_INLINE_FUNCTION
+  offset_type get_bin_offsets() const { return bin_offsets;}
+
+  // Get the count for each bin
+  KOKKOS_INLINE_FUNCTION
+  bin_count_type get_bin_count() const {return bin_count_const;}
+
+public:
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const bin_count_tag& tag, const int& i) const {
+    bin_count_atomic(bin_op.bin(keys,i))++;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const bin_offset_tag& tag, const int& i, value_type& offset, const bool& final)  const {
+    if(final) {
+      bin_offsets(i) = offset;
+    }
+    offset+=bin_count_const(i);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const bin_binning_tag& tag, const int& i)  const {
+    const int bin = bin_op.bin(keys,i);
+    const int count = bin_count_atomic(bin)++;
+
+    sort_order(bin_offsets(bin) + count) = i;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const bin_sort_bins_tag& tag, const int&i )  const {
+    bool sorted = false;
+    int upper_bound = bin_offsets(i)+bin_count_const(i);
+    while(!sorted) {
+      sorted = true;
+      int old_idx = sort_order(bin_offsets(i));
+      int new_idx;
+      for(int k=bin_offsets(i)+1; k<upper_bound; k++) {
+        new_idx = sort_order(k);
+
+        if(!bin_op(keys_rnd,old_idx,new_idx)) {
+          sort_order(k-1) = new_idx;
+          sort_order(k) = old_idx;
+          sorted = false;
+        } else {
+          old_idx = new_idx;
+        }
+      }
+      upper_bound--;
+    }
+  }
+};
+
+namespace SortImpl {
+
+template<class KeyViewType>
+struct DefaultBinOp1D {
+  const int max_bins_;
+  const double mul_;
+  typename KeyViewType::const_value_type range_;
+  typename KeyViewType::const_value_type min_;
+
+  //Construct BinOp with number of bins, minimum value and maxuimum value
+  DefaultBinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
+                               typename KeyViewType::const_value_type max )
+     :max_bins_(max_bins__+1),mul_(1.0*max_bins__/(max-min)),range_(max-min),min_(min) {}
+
+  //Determine bin index from key value
+  template<class ViewType>
+  KOKKOS_INLINE_FUNCTION
+  int bin(ViewType& keys, const int& i) const {
+    return int(mul_*(keys(i)-min_));
+  }
+
+  //Return maximum bin index + 1
+  KOKKOS_INLINE_FUNCTION
+  int max_bins() const {
+    return max_bins_;
+  }
+
+  //Compare to keys within a bin if true new_val will be put before old_val
+  template<class ViewType, typename iType1, typename iType2>
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ViewType& keys, iType1& i1, iType2& i2) const {
+    return keys(i1)<keys(i2);
+  }
+};
+
+template<class KeyViewType>
+struct DefaultBinOp3D {
+  int max_bins_[3];
+  double mul_[3];
+  typename KeyViewType::non_const_value_type range_[3];
+  typename KeyViewType::non_const_value_type min_[3];
+
+  DefaultBinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[],
+                               typename KeyViewType::const_value_type max[] )
+  {
+    max_bins_[0] = max_bins__[0]+1;
+    max_bins_[1] = max_bins__[1]+1;
+    max_bins_[2] = max_bins__[2]+1;
+    mul_[0] = 1.0*max_bins__[0]/(max[0]-min[0]);
+    mul_[1] = 1.0*max_bins__[1]/(max[1]-min[1]);
+    mul_[2] = 1.0*max_bins__[2]/(max[2]-min[2]);
+    range_[0] = max[0]-min[0];
+    range_[1] = max[1]-min[1];
+    range_[2] = max[2]-min[2];
+    min_[0] = min[0];
+    min_[1] = min[1];
+    min_[2] = min[2];
+  }
+
+  template<class ViewType>
+  KOKKOS_INLINE_FUNCTION
+  int bin(ViewType& keys, const int& i) const {
+    return int( (((int(mul_[0]*(keys(i,0)-min_[0]))*max_bins_[1]) +
+                   int(mul_[1]*(keys(i,1)-min_[1])))*max_bins_[2]) +
+                   int(mul_[2]*(keys(i,2)-min_[2])));
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  int max_bins() const {
+    return max_bins_[0]*max_bins_[1]*max_bins_[2];
+  }
+
+  template<class ViewType, typename iType1, typename iType2>
+  KOKKOS_INLINE_FUNCTION
+  bool operator()(ViewType& keys, iType1& i1 , iType2& i2) const {
+    if (keys(i1,0)>keys(i2,0)) return true;
+    else if (keys(i1,0)==keys(i2,0)) {
+      if (keys(i1,1)>keys(i2,1)) return true;
+      else if (keys(i1,1)==keys(i2,2)) {
+        if (keys(i1,2)>keys(i2,2)) return true;
+      }
+    }
+    return false;
+  }
+};
+
+template<typename Scalar>
+struct min_max {
+  Scalar min;
+  Scalar max;
+  bool init;
+
+  KOKKOS_INLINE_FUNCTION
+  min_max() {
+    min = 0;
+    max = 0;
+    init = 0;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  min_max (const min_max& val) {
+    min = val.min;
+    max = val.max;
+    init = val.init;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  min_max operator = (const min_max& val) {
+    min = val.min;
+    max = val.max;
+    init = val.init;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator+= (const Scalar& val) {
+    if(init) {
+      min = min<val?min:val;
+      max = max>val?max:val;
+    } else {
+      min = val;
+      max = val;
+      init = 1;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator+= (const min_max& val) {
+    if(init && val.init) {
+      min = min<val.min?min:val.min;
+      max = max>val.max?max:val.max;
+    } else {
+      if(val.init) {
+        min = val.min;
+        max = val.max;
+        init = 1;
+      }
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator+= (volatile const Scalar& val) volatile {
+    if(init) {
+      min = min<val?min:val;
+      max = max>val?max:val;
+    } else {
+      min = val;
+      max = val;
+      init = 1;
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator+= (volatile const min_max& val) volatile {
+    if(init && val.init) {
+      min = min<val.min?min:val.min;
+      max = max>val.max?max:val.max;
+    } else {
+      if(val.init) {
+        min = val.min;
+        max = val.max;
+        init = 1;
+      }
+    }
+  }
+};
+
+
+template<class ViewType>
+struct min_max_functor {
+  typedef typename ViewType::execution_space execution_space;
+  ViewType view;
+  typedef min_max<typename ViewType::non_const_value_type> value_type;
+  min_max_functor (const ViewType view_):view(view_) {
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(const size_t& i, value_type& val) const {
+    val += view(i);
+  }
+};
+
+template<class ViewType>
+bool try_std_sort(ViewType view) {
+  bool possible = true;
+#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
+  size_t stride[8];
+  view.stride(stride);
+#else
+  size_t stride[8] = { view.stride_0()
+                     , view.stride_1()
+                     , view.stride_2()
+                     , view.stride_3()
+                     , view.stride_4()
+                     , view.stride_5()
+                     , view.stride_6()
+                     , view.stride_7()
+                     };
+#endif
+  possible  = possible && Impl::is_same<typename ViewType::memory_space, HostSpace>::value;
+  possible  = possible && (ViewType::Rank == 1);
+  possible  = possible && (stride[0] == 1);
+  if(possible)  {
+   std::sort(view.ptr_on_device(),view.ptr_on_device()+view.dimension_0());
+  }
+  return possible;
+}
+
+}
+
+template<class ViewType>
+void sort(ViewType view, bool always_use_kokkos_sort = false) {
+  if(!always_use_kokkos_sort) {
+    if(SortImpl::try_std_sort(view)) return;
+  }
+
+  typedef SortImpl::DefaultBinOp1D<ViewType> CompType;
+  SortImpl::min_max<typename ViewType::non_const_value_type> val;
+  parallel_reduce(view.dimension_0(),SortImpl::min_max_functor<ViewType>(view),val);
+  BinSort<ViewType, CompType> bin_sort(view,CompType(view.dimension_0()/2,val.min,val.max),true);
+  bin_sort.create_permute_vector();
+  bin_sort.sort(view);
+}
+
+/*template<class ViewType, class Comparator>
+void sort(ViewType view, Comparator comp, bool always_use_kokkos_sort = false) {
+
+}*/
+
+}
+
+#endif
diff --git a/lib/kokkos/algorithms/unit_tests/Makefile b/lib/kokkos/algorithms/unit_tests/Makefile
new file mode 100755
index 0000000000000000000000000000000000000000..5fc94ac0f82f22cca2e070f8f68f94dd8075a052
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/Makefile
@@ -0,0 +1,92 @@
+KOKKOS_PATH = ../..
+
+GTEST_PATH = ../../TPL/gtest
+
+vpath %.cpp ${KOKKOS_PATH}/algorithms/unit_tests
+
+default: build_all
+	echo "End Build"
+	
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	CXX = nvcc_wrapper
+	CXXFLAGS ?= -O3
+	LINK = $(CXX)
+	LDFLAGS ?= -lpthread
+else
+	CXX ?= g++
+	CXXFLAGS ?= -O3
+	LINK ?= $(CXX)
+	LDFLAGS ?= -lpthread
+endif
+
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/algorithms/unit_tests
+
+TEST_TARGETS = 
+TARGETS = 
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	OBJ_CUDA = TestCuda.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosAlgorithms_UnitTest_Cuda
+	TEST_TARGETS += test-cuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+	OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosAlgorithms_UnitTest_Threads
+	TEST_TARGETS += test-threads
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	OBJ_OPENMP = TestOpenMP.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosAlgorithms_UnitTest_OpenMP
+	TEST_TARGETS += test-openmp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+	OBJ_SERIAL = TestSerial.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosAlgorithms_UnitTest_Serial
+	TEST_TARGETS += test-serial
+endif
+
+KokkosAlgorithms_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosAlgorithms_UnitTest_Cuda
+
+KokkosAlgorithms_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosAlgorithms_UnitTest_Threads
+	
+KokkosAlgorithms_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosAlgorithms_UnitTest_OpenMP
+
+KokkosAlgorithms_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) -o KokkosAlgorithms_UnitTest_Serial
+
+test-cuda: KokkosAlgorithms_UnitTest_Cuda
+	./KokkosAlgorithms_UnitTest_Cuda
+
+test-threads: KokkosAlgorithms_UnitTest_Threads
+	./KokkosAlgorithms_UnitTest_Threads
+
+test-openmp: KokkosAlgorithms_UnitTest_OpenMP
+	./KokkosAlgorithms_UnitTest_OpenMP
+
+test-serial: KokkosAlgorithms_UnitTest_Serial
+	./KokkosAlgorithms_UnitTest_Serial
+	
+build_all: $(TARGETS)
+
+test: $(TEST_TARGETS)
+	
+clean: kokkos-clean 
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
+gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc 
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
+
diff --git a/lib/kokkos/algorithms/unit_tests/TestCuda.cpp b/lib/kokkos/algorithms/unit_tests/TestCuda.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..d19c778c4663bff82e50037d2d1b6ffaeeff103d
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/TestCuda.cpp
@@ -0,0 +1,110 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdint.h>
+#include <iostream>
+#include <iomanip>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+
+namespace Test {
+
+class cuda : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+    Kokkos::HostSpace::execution_space::initialize();
+    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
+  }
+  static void TearDownTestCase()
+  {
+    Kokkos::Cuda::finalize();
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+};
+
+void cuda_test_random_xorshift64( int num_draws  )
+{
+  Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Cuda> >(num_draws);
+}
+
+void cuda_test_random_xorshift1024( int num_draws  )
+{
+  Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Cuda> >(num_draws);
+}
+
+
+#define CUDA_RANDOM_XORSHIFT64( num_draws )                                \
+  TEST_F( cuda, Random_XorShift64 ) {   \
+  cuda_test_random_xorshift64(num_draws);                                   \
+  }
+
+#define CUDA_RANDOM_XORSHIFT1024( num_draws )                                \
+  TEST_F( cuda, Random_XorShift1024 ) {   \
+  cuda_test_random_xorshift1024(num_draws);                                   \
+  }
+
+#define CUDA_SORT_UNSIGNED( size )                                \
+  TEST_F( cuda, SortUnsigned ) {   \
+      Impl::test_sort< Kokkos::Cuda, unsigned >(size);                                   \
+  }
+
+CUDA_RANDOM_XORSHIFT64(  132141141 )
+CUDA_RANDOM_XORSHIFT1024( 52428813 )
+CUDA_SORT_UNSIGNED(171)
+
+#undef CUDA_RANDOM_XORSHIFT64
+#undef CUDA_RANDOM_XORSHIFT1024
+#undef CUDA_SORT_UNSIGNED
+}
+
+#endif  /* #ifdef KOKKOS_HAVE_CUDA */
+
diff --git a/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp b/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..4b06dffcb6a068503770229091ab15330bf6af89
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/TestOpenMP.cpp
@@ -0,0 +1,102 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+#include <iomanip>
+
+namespace Test {
+
+#ifdef KOKKOS_HAVE_OPENMP
+class openmp : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+
+    unsigned threads_count = omp_get_max_threads();
+
+    if ( Kokkos::hwloc::available() ) {
+      threads_count = Kokkos::hwloc::get_available_numa_count() *
+                      Kokkos::hwloc::get_available_cores_per_numa();
+    }
+
+    Kokkos::OpenMP::initialize( threads_count );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::OpenMP::finalize();
+  }
+};
+
+#define OPENMP_RANDOM_XORSHIFT64( num_draws )                                \
+  TEST_F( openmp, Random_XorShift64 ) {   \
+      Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::OpenMP> >(num_draws);                                   \
+  }
+
+#define OPENMP_RANDOM_XORSHIFT1024( num_draws )                                \
+  TEST_F( openmp, Random_XorShift1024 ) {   \
+      Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::OpenMP> >(num_draws);                                   \
+  }
+
+#define OPENMP_SORT_UNSIGNED( size )                                \
+  TEST_F( openmp, SortUnsigned ) {   \
+      Impl::test_sort< Kokkos::OpenMP, unsigned >(size);                                   \
+  }
+
+OPENMP_RANDOM_XORSHIFT64( 10240000 )
+OPENMP_RANDOM_XORSHIFT1024( 10130144 )
+OPENMP_SORT_UNSIGNED(171)
+
+#undef OPENMP_RANDOM_XORSHIFT64
+#undef OPENMP_RANDOM_XORSHIFT1024
+#undef OPENMP_SORT_UNSIGNED
+#endif
+} // namespace test
+
diff --git a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..eade74ed93074dc0f25d9a8fcd810ff436fc5523
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp
@@ -0,0 +1,476 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_DUALVIEW_HPP
+#define KOKKOS_TEST_DUALVIEW_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <cstdlib>
+#include <cstdio>
+#include <impl/Kokkos_Timer.hpp>
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Random.hpp>
+#include <cmath>
+
+namespace Test {
+
+namespace Impl{
+
+// This test runs the random number generators and uses some statistic tests to
+// check the 'goodness' of the random numbers:
+//    (i)   mean:         the mean is expected to be 0.5*RAND_MAX
+//    (ii)  variance:     the variance is 1/3*mean*mean
+//    (iii) covariance:   the covariance is 0
+//    (iv)  1-tupledistr: the mean, variance and covariance of a 1D Histrogram of random numbers
+//    (v)   3-tupledistr: the mean, variance and covariance of a 3D Histrogram of random numbers
+
+#define HIST_DIM3D 24
+#define HIST_DIM1D (HIST_DIM3D*HIST_DIM3D*HIST_DIM3D)
+
+struct RandomProperties {
+  uint64_t count;
+  double mean;
+  double variance;
+  double covariance;
+  double min;
+  double max;
+
+  KOKKOS_INLINE_FUNCTION
+  RandomProperties() {
+    count = 0;
+    mean = 0.0;
+    variance = 0.0;
+    covariance = 0.0;
+    min = 1e64;
+    max = -1e64;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  RandomProperties& operator+=(const RandomProperties& add) {
+    count      += add.count;
+    mean       += add.mean;
+    variance   += add.variance;
+    covariance += add.covariance;
+    min         = add.min<min?add.min:min;
+    max         = add.max>max?add.max:max;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator+=(const volatile RandomProperties& add) volatile {
+    count      += add.count;
+    mean       += add.mean;
+    variance   += add.variance;
+    covariance += add.covariance;
+    min         = add.min<min?add.min:min;
+    max         = add.max>max?add.max:max;
+  }
+};
+
+template<class GeneratorPool, class Scalar>
+struct test_random_functor {
+  typedef typename GeneratorPool::generator_type rnd_type;
+
+  typedef RandomProperties value_type;
+  typedef typename GeneratorPool::device_type device_type;
+
+  GeneratorPool rand_pool;
+  const double mean;
+
+  // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to define
+  // an exclusive upper bound on the range of random numbers that
+  // draw() can generate.  However, for the float specialization, some
+  // implementations might violate this upper bound, due to rounding
+  // error.  Just in case, we leave an extra space at the end of each
+  // dimension, in the View types below.
+  typedef Kokkos::View<int[HIST_DIM1D+1],typename GeneratorPool::device_type> type_1d;
+  type_1d density_1d;
+  typedef Kokkos::View<int[HIST_DIM3D+1][HIST_DIM3D+1][HIST_DIM3D+1],typename GeneratorPool::device_type> type_3d;
+  type_3d density_3d;
+
+  test_random_functor (GeneratorPool rand_pool_, type_1d d1d, type_3d d3d) :
+    rand_pool (rand_pool_),
+    mean (0.5*Kokkos::rand<rnd_type,Scalar>::max ()),
+    density_1d (d1d),
+    density_3d (d3d)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, RandomProperties& prop) const {
+    using Kokkos::atomic_fetch_add;
+
+    rnd_type rand_gen = rand_pool.get_state();
+    for (int k = 0; k < 1024; ++k) {
+      const Scalar tmp = Kokkos::rand<rnd_type,Scalar>::draw(rand_gen);
+      prop.count++;
+      prop.mean += tmp;
+      prop.variance += (tmp-mean)*(tmp-mean);
+      const Scalar tmp2 = Kokkos::rand<rnd_type,Scalar>::draw(rand_gen);
+      prop.count++;
+      prop.mean += tmp2;
+      prop.variance += (tmp2-mean)*(tmp2-mean);
+      prop.covariance += (tmp-mean)*(tmp2-mean);
+      const Scalar tmp3 = Kokkos::rand<rnd_type,Scalar>::draw(rand_gen);
+      prop.count++;
+      prop.mean += tmp3;
+      prop.variance += (tmp3-mean)*(tmp3-mean);
+      prop.covariance += (tmp2-mean)*(tmp3-mean);
+
+      // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to
+      // define an exclusive upper bound on the range of random
+      // numbers that draw() can generate.  However, for the float
+      // specialization, some implementations might violate this upper
+      // bound, due to rounding error.  Just in case, we have left an
+      // extra space at the end of each dimension of density_1d and
+      // density_3d.
+      //
+      // Please note that those extra entries might not get counted in
+      // the histograms.  However, if Kokkos::rand is broken and only
+      // returns values of max(), the histograms will still catch this
+      // indirectly, since none of the other values will be filled in.
+
+      const Scalar theMax = Kokkos::rand<rnd_type, Scalar>::max ();
+
+      const uint64_t ind1_1d = static_cast<uint64_t> (1.0 * HIST_DIM1D * tmp / theMax);
+      const uint64_t ind2_1d = static_cast<uint64_t> (1.0 * HIST_DIM1D * tmp2 / theMax);
+      const uint64_t ind3_1d = static_cast<uint64_t> (1.0 * HIST_DIM1D * tmp3 / theMax);
+
+      const uint64_t ind1_3d = static_cast<uint64_t> (1.0 * HIST_DIM3D * tmp / theMax);
+      const uint64_t ind2_3d = static_cast<uint64_t> (1.0 * HIST_DIM3D * tmp2 / theMax);
+      const uint64_t ind3_3d = static_cast<uint64_t> (1.0 * HIST_DIM3D * tmp3 / theMax);
+
+      atomic_fetch_add (&density_1d(ind1_1d), 1);
+      atomic_fetch_add (&density_1d(ind2_1d), 1);
+      atomic_fetch_add (&density_1d(ind3_1d), 1);
+      atomic_fetch_add (&density_3d(ind1_3d, ind2_3d, ind3_3d), 1);
+    }
+    rand_pool.free_state(rand_gen);
+  }
+};
+
+template<class DeviceType>
+struct test_histogram1d_functor {
+  typedef RandomProperties value_type;
+  typedef typename DeviceType::execution_space execution_space;
+  typedef typename DeviceType::memory_space memory_space;
+
+  // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to define
+  // an exclusive upper bound on the range of random numbers that
+  // draw() can generate.  However, for the float specialization, some
+  // implementations might violate this upper bound, due to rounding
+  // error.  Just in case, we leave an extra space at the end of each
+  // dimension, in the View type below.
+  typedef Kokkos::View<int[HIST_DIM1D+1], memory_space> type_1d;
+  type_1d density_1d;
+  double mean;
+
+  test_histogram1d_functor (type_1d d1d, int num_draws) :
+    density_1d (d1d),
+    mean (1.0*num_draws/HIST_DIM1D*3)
+  {
+    printf ("Mean: %e\n", mean);
+  }
+
+  KOKKOS_INLINE_FUNCTION void
+  operator() (const typename memory_space::size_type i,
+              RandomProperties& prop) const
+  {
+    typedef typename memory_space::size_type size_type;
+    const double count = density_1d(i);
+    prop.mean += count;
+    prop.variance += 1.0 * (count - mean) * (count - mean);
+    //prop.covariance += 1.0*count*count;
+    prop.min = count < prop.min ? count : prop.min;
+    prop.max = count > prop.max ? count : prop.max;
+    if (i < static_cast<size_type> (HIST_DIM1D-1)) {
+      prop.covariance += (count - mean) * (density_1d(i+1) - mean);
+    }
+  }
+};
+
+template<class DeviceType>
+struct test_histogram3d_functor {
+  typedef RandomProperties value_type;
+  typedef typename DeviceType::execution_space execution_space;
+  typedef typename DeviceType::memory_space memory_space;
+
+  // NOTE (mfh 03 Nov 2014): Kokkos::rand::max() is supposed to define
+  // an exclusive upper bound on the range of random numbers that
+  // draw() can generate.  However, for the float specialization, some
+  // implementations might violate this upper bound, due to rounding
+  // error.  Just in case, we leave an extra space at the end of each
+  // dimension, in the View type below.
+  typedef Kokkos::View<int[HIST_DIM3D+1][HIST_DIM3D+1][HIST_DIM3D+1], memory_space> type_3d;
+  type_3d density_3d;
+  double mean;
+
+  test_histogram3d_functor (type_3d d3d, int num_draws) :
+    density_3d (d3d),
+    mean (1.0*num_draws/HIST_DIM1D)
+  {}
+
+  KOKKOS_INLINE_FUNCTION void
+  operator() (const typename memory_space::size_type i,
+              RandomProperties& prop) const
+  {
+    typedef typename memory_space::size_type size_type;
+    const double count = density_3d(i/(HIST_DIM3D*HIST_DIM3D),
+                                    (i % (HIST_DIM3D*HIST_DIM3D))/HIST_DIM3D,
+                                    i % HIST_DIM3D);
+    prop.mean += count;
+    prop.variance += (count - mean) * (count - mean);
+    if (i < static_cast<size_type> (HIST_DIM1D-1)) {
+      const double count_next = density_3d((i+1)/(HIST_DIM3D*HIST_DIM3D),
+                                           ((i+1)%(HIST_DIM3D*HIST_DIM3D))/HIST_DIM3D,
+                                           (i+1)%HIST_DIM3D);
+      prop.covariance += (count - mean) * (count_next - mean);
+    }
+  }
+};
+
+//
+// Templated test that uses the above functors.
+//
+template <class RandomGenerator,class Scalar>
+struct test_random_scalar {
+  typedef typename RandomGenerator::generator_type rnd_type;
+
+  int pass_mean,pass_var,pass_covar;
+  int pass_hist1d_mean,pass_hist1d_var,pass_hist1d_covar;
+  int pass_hist3d_mean,pass_hist3d_var,pass_hist3d_covar;
+
+  test_random_scalar (typename test_random_functor<RandomGenerator,int>::type_1d& density_1d,
+                      typename test_random_functor<RandomGenerator,int>::type_3d& density_3d,
+                      RandomGenerator& pool,
+                      unsigned int num_draws)
+  {
+    using std::cerr;
+    using std::endl;
+    using Kokkos::parallel_reduce;
+
+    {
+      cerr << " -- Testing randomness properties" << endl;
+
+      RandomProperties result;
+      typedef test_random_functor<RandomGenerator, Scalar> functor_type;
+      parallel_reduce (num_draws/1024, functor_type (pool, density_1d, density_3d), result);
+
+      //printf("Result: %lf %lf %lf\n",result.mean/num_draws/3,result.variance/num_draws/3,result.covariance/num_draws/2);
+      double tolerance = 2.0*sqrt(1.0/num_draws);
+      double mean_expect = 0.5*Kokkos::rand<rnd_type,Scalar>::max();
+      double variance_expect = 1.0/3.0*mean_expect*mean_expect;
+      double mean_eps = mean_expect/(result.mean/num_draws/3)-1.0;
+      double variance_eps = variance_expect/(result.variance/num_draws/3)-1.0;
+      double covariance_eps = result.covariance/num_draws/2/variance_expect;
+      pass_mean  = ((-tolerance < mean_eps) &&
+                    ( tolerance > mean_eps)) ? 1:0;
+      pass_var   = ((-tolerance < variance_eps) &&
+                    ( tolerance > variance_eps)) ? 1:0;
+      pass_covar = ((-1.4*tolerance < covariance_eps) &&
+                    ( 1.4*tolerance > covariance_eps)) ? 1:0;
+      cerr << "Pass: " << pass_mean
+           << " " << pass_var
+           << " " << mean_eps
+           << " " << variance_eps
+           << " " << covariance_eps
+           << " || " << tolerance << endl;
+    }
+    {
+      cerr << " -- Testing 1-D histogram" << endl;
+
+      RandomProperties result;
+      typedef test_histogram1d_functor<typename RandomGenerator::device_type> functor_type;
+      parallel_reduce (HIST_DIM1D, functor_type (density_1d, num_draws), result);
+
+      double tolerance = 6*sqrt(1.0/HIST_DIM1D);
+      double mean_expect = 1.0*num_draws*3/HIST_DIM1D;
+      double variance_expect = 1.0*num_draws*3/HIST_DIM1D*(1.0-1.0/HIST_DIM1D);
+      double covariance_expect = -1.0*num_draws*3/HIST_DIM1D/HIST_DIM1D;
+      double mean_eps = mean_expect/(result.mean/HIST_DIM1D)-1.0;
+      double variance_eps = variance_expect/(result.variance/HIST_DIM1D)-1.0;
+      double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
+      pass_hist1d_mean  = ((-tolerance < mean_eps) &&
+                           ( tolerance > mean_eps)) ? 1:0;
+      pass_hist1d_var   = ((-tolerance < variance_eps) &&
+                           ( tolerance > variance_eps)) ? 1:0;
+      pass_hist1d_covar = ((-tolerance < covariance_eps) &&
+                           ( tolerance > covariance_eps)) ? 1:0;
+
+      cerr << "Density 1D: " << mean_eps
+           << " " << variance_eps
+           << " " << (result.covariance/HIST_DIM1D/HIST_DIM1D)
+           << " || " << tolerance
+           << " " << result.min
+           << " " << result.max
+           << " || " << result.variance/HIST_DIM1D
+           << " " << 1.0*num_draws*3/HIST_DIM1D*(1.0-1.0/HIST_DIM1D)
+           << " || " << result.covariance/HIST_DIM1D
+           << " " << -1.0*num_draws*3/HIST_DIM1D/HIST_DIM1D
+           << endl;
+    }
+    {
+      cerr << " -- Testing 3-D histogram" << endl;
+
+      RandomProperties result;
+      typedef test_histogram3d_functor<typename RandomGenerator::device_type> functor_type;
+      parallel_reduce (HIST_DIM1D, functor_type (density_3d, num_draws), result);
+
+      double tolerance = 6*sqrt(1.0/HIST_DIM1D);
+      double mean_expect = 1.0*num_draws/HIST_DIM1D;
+      double variance_expect = 1.0*num_draws/HIST_DIM1D*(1.0-1.0/HIST_DIM1D);
+      double covariance_expect = -1.0*num_draws/HIST_DIM1D/HIST_DIM1D;
+      double mean_eps = mean_expect/(result.mean/HIST_DIM1D)-1.0;
+      double variance_eps = variance_expect/(result.variance/HIST_DIM1D)-1.0;
+      double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
+      pass_hist3d_mean  = ((-tolerance < mean_eps) &&
+                           ( tolerance > mean_eps)) ? 1:0;
+      pass_hist3d_var   = ((-tolerance < variance_eps) &&
+                           ( tolerance > variance_eps)) ? 1:0;
+      pass_hist3d_covar = ((-tolerance < covariance_eps) &&
+                           ( tolerance > covariance_eps)) ? 1:0;
+
+      cerr << "Density 3D: " << mean_eps
+           << " " << variance_eps
+           << " " << result.covariance/HIST_DIM1D/HIST_DIM1D
+           << " || " << tolerance
+           << " " << result.min
+           << " " << result.max << endl;
+    }
+  }
+};
+
+template <class RandomGenerator>
+void test_random(unsigned int num_draws)
+{
+  using std::cerr;
+  using std::endl;
+  typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d");
+  typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d");
+
+  cerr << "Test Scalar=int" << endl;
+  RandomGenerator pool(31891);
+  test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_int.pass_mean,1);
+  ASSERT_EQ( test_int.pass_var,1);
+  ASSERT_EQ( test_int.pass_covar,1);
+  ASSERT_EQ( test_int.pass_hist1d_mean,1);
+  ASSERT_EQ( test_int.pass_hist1d_var,1);
+  ASSERT_EQ( test_int.pass_hist1d_covar,1);
+  ASSERT_EQ( test_int.pass_hist3d_mean,1);
+  ASSERT_EQ( test_int.pass_hist3d_var,1);
+  ASSERT_EQ( test_int.pass_hist3d_covar,1);
+  deep_copy(density_1d,0);
+  deep_copy(density_3d,0);
+
+  cerr << "Test Scalar=unsigned int" << endl;
+  test_random_scalar<RandomGenerator,unsigned int> test_uint(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_uint.pass_mean,1);
+  ASSERT_EQ( test_uint.pass_var,1);
+  ASSERT_EQ( test_uint.pass_covar,1);
+  ASSERT_EQ( test_uint.pass_hist1d_mean,1);
+  ASSERT_EQ( test_uint.pass_hist1d_var,1);
+  ASSERT_EQ( test_uint.pass_hist1d_covar,1);
+  ASSERT_EQ( test_uint.pass_hist3d_mean,1);
+  ASSERT_EQ( test_uint.pass_hist3d_var,1);
+  ASSERT_EQ( test_uint.pass_hist3d_covar,1);
+  deep_copy(density_1d,0);
+  deep_copy(density_3d,0);
+
+  cerr << "Test Scalar=int64_t" << endl;
+  test_random_scalar<RandomGenerator,int64_t> test_int64(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_int64.pass_mean,1);
+  ASSERT_EQ( test_int64.pass_var,1);
+  ASSERT_EQ( test_int64.pass_covar,1);
+  ASSERT_EQ( test_int64.pass_hist1d_mean,1);
+  ASSERT_EQ( test_int64.pass_hist1d_var,1);
+  ASSERT_EQ( test_int64.pass_hist1d_covar,1);
+  ASSERT_EQ( test_int64.pass_hist3d_mean,1);
+  ASSERT_EQ( test_int64.pass_hist3d_var,1);
+  ASSERT_EQ( test_int64.pass_hist3d_covar,1);
+  deep_copy(density_1d,0);
+  deep_copy(density_3d,0);
+
+  cerr << "Test Scalar=uint64_t" << endl;
+  test_random_scalar<RandomGenerator,uint64_t> test_uint64(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_uint64.pass_mean,1);
+  ASSERT_EQ( test_uint64.pass_var,1);
+  ASSERT_EQ( test_uint64.pass_covar,1);
+  ASSERT_EQ( test_uint64.pass_hist1d_mean,1);
+  ASSERT_EQ( test_uint64.pass_hist1d_var,1);
+  ASSERT_EQ( test_uint64.pass_hist1d_covar,1);
+  ASSERT_EQ( test_uint64.pass_hist3d_mean,1);
+  ASSERT_EQ( test_uint64.pass_hist3d_var,1);
+  ASSERT_EQ( test_uint64.pass_hist3d_covar,1);
+  deep_copy(density_1d,0);
+  deep_copy(density_3d,0);
+
+  cerr << "Test Scalar=float" << endl;
+  test_random_scalar<RandomGenerator,float> test_float(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_float.pass_mean,1);
+  ASSERT_EQ( test_float.pass_var,1);
+  ASSERT_EQ( test_float.pass_covar,1);
+  ASSERT_EQ( test_float.pass_hist1d_mean,1);
+  ASSERT_EQ( test_float.pass_hist1d_var,1);
+  ASSERT_EQ( test_float.pass_hist1d_covar,1);
+  ASSERT_EQ( test_float.pass_hist3d_mean,1);
+  ASSERT_EQ( test_float.pass_hist3d_var,1);
+  ASSERT_EQ( test_float.pass_hist3d_covar,1);
+  deep_copy(density_1d,0);
+  deep_copy(density_3d,0);
+
+  cerr << "Test Scalar=double" << endl;
+  test_random_scalar<RandomGenerator,double> test_double(density_1d,density_3d,pool,num_draws);
+  ASSERT_EQ( test_double.pass_mean,1);
+  ASSERT_EQ( test_double.pass_var,1);
+  ASSERT_EQ( test_double.pass_covar,1);
+  ASSERT_EQ( test_double.pass_hist1d_mean,1);
+  ASSERT_EQ( test_double.pass_hist1d_var,1);
+  ASSERT_EQ( test_double.pass_hist1d_covar,1);
+  ASSERT_EQ( test_double.pass_hist3d_mean,1);
+  ASSERT_EQ( test_double.pass_hist3d_var,1);
+  ASSERT_EQ( test_double.pass_hist3d_covar,1);
+}
+}
+
+} // namespace Test
+
+#endif //KOKKOS_TEST_UNORDERED_MAP_HPP
diff --git a/lib/kokkos/algorithms/unit_tests/TestSerial.cpp b/lib/kokkos/algorithms/unit_tests/TestSerial.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..741cf97ae13f245fafeb95078222943afda8ed1d
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/TestSerial.cpp
@@ -0,0 +1,99 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+#include <iomanip>
+
+
+//----------------------------------------------------------------------------
+
+
+namespace Test {
+
+#ifdef KOKKOS_HAVE_SERIAL
+class serial : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision (5) << std::scientific;
+    Kokkos::Serial::initialize ();
+  }
+
+  static void TearDownTestCase ()
+  {
+    Kokkos::Serial::finalize ();
+  }
+};
+
+#define SERIAL_RANDOM_XORSHIFT64( num_draws )  \
+  TEST_F( serial, Random_XorShift64 ) {                                \
+    Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Serial> >(num_draws); \
+  }
+
+#define SERIAL_RANDOM_XORSHIFT1024( num_draws )        \
+  TEST_F( serial, Random_XorShift1024 ) {                              \
+    Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Serial> >(num_draws); \
+  }
+
+#define SERIAL_SORT_UNSIGNED( size )                                \
+  TEST_F( serial, SortUnsigned ) {   \
+      Impl::test_sort< Kokkos::Serial, unsigned >(size);                                   \
+  }
+
+SERIAL_RANDOM_XORSHIFT64( 10240000 )
+SERIAL_RANDOM_XORSHIFT1024( 10130144 )
+SERIAL_SORT_UNSIGNED(171)
+
+#undef SERIAL_RANDOM_XORSHIFT64
+#undef SERIAL_RANDOM_XORSHIFT1024
+#undef SERIAL_SORT_UNSIGNED
+
+#endif // KOKKOS_HAVE_SERIAL
+} // namespace Test
+
+
diff --git a/lib/kokkos/algorithms/unit_tests/TestSort.hpp b/lib/kokkos/algorithms/unit_tests/TestSort.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..ccbcbdd0011bbc577ac8c39b2f593ed35f2546ac
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/TestSort.hpp
@@ -0,0 +1,206 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef TESTSORT_HPP_
+#define TESTSORT_HPP_
+
+#include <gtest/gtest.h>
+#include<Kokkos_Core.hpp>
+#include<Kokkos_Random.hpp>
+#include<Kokkos_Sort.hpp>
+
+namespace Test {
+
+namespace Impl{
+
+template<class ExecutionSpace, class Scalar>
+struct is_sorted_struct {
+  typedef unsigned int value_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<Scalar*,ExecutionSpace> keys;
+
+  is_sorted_struct(Kokkos::View<Scalar*,ExecutionSpace> keys_):keys(keys_) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, unsigned int& count) const {
+    if(keys(i)>keys(i+1)) count++;
+  }
+};
+
+template<class ExecutionSpace, class Scalar>
+struct sum {
+  typedef double value_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<Scalar*,ExecutionSpace> keys;
+
+  sum(Kokkos::View<Scalar*,ExecutionSpace> keys_):keys(keys_) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, double& count) const {
+    count+=keys(i);
+  }
+};
+
+template<class ExecutionSpace, class Scalar>
+struct bin3d_is_sorted_struct {
+  typedef unsigned int value_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<Scalar*[3],ExecutionSpace> keys;
+
+  int max_bins;
+  Scalar min;
+  Scalar max;
+
+  bin3d_is_sorted_struct(Kokkos::View<Scalar*[3],ExecutionSpace> keys_,int max_bins_,Scalar min_,Scalar max_):
+    keys(keys_),max_bins(max_bins_),min(min_),max(max_) {
+  }
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, unsigned int& count) const {
+    int ix1 = int ((keys(i,0)-min)/max * max_bins);
+    int iy1 = int ((keys(i,1)-min)/max * max_bins);
+    int iz1 = int ((keys(i,2)-min)/max * max_bins);
+    int ix2 = int ((keys(i+1,0)-min)/max * max_bins);
+    int iy2 = int ((keys(i+1,1)-min)/max * max_bins);
+    int iz2 = int ((keys(i+1,2)-min)/max * max_bins);
+
+    if (ix1>ix2)  count++;
+    else if(ix1==ix2) {
+      if (iy1>iy2)  count++;
+      else if ((iy1==iy2) && (iz1>iz2))  count++;
+    }
+  }
+};
+
+template<class ExecutionSpace, class Scalar>
+struct sum3D {
+  typedef double value_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<Scalar*[3],ExecutionSpace> keys;
+
+  sum3D(Kokkos::View<Scalar*[3],ExecutionSpace> keys_):keys(keys_) {}
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, double& count) const {
+    count+=keys(i,0);
+    count+=keys(i,1);
+    count+=keys(i,2);
+  }
+};
+
+template<class ExecutionSpace, typename KeyType>
+void test_1D_sort(unsigned int n,bool force_kokkos) {
+  typedef Kokkos::View<KeyType*,ExecutionSpace> KeyViewType;
+  KeyViewType keys("Keys",n);
+
+  Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
+  Kokkos::fill_random(keys,g,Kokkos::Random_XorShift64_Pool<ExecutionSpace>::generator_type::MAX_URAND);
+
+  double sum_before = 0.0;
+  double sum_after = 0.0;
+  unsigned int sort_fails = 0;
+
+  Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys),sum_before);
+
+  Kokkos::sort(keys,force_kokkos);
+
+  Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys),sum_after);
+  Kokkos::parallel_reduce(n-1,is_sorted_struct<ExecutionSpace, KeyType>(keys),sort_fails);
+
+  double ratio = sum_before/sum_after;
+  double epsilon = 1e-10;
+  unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 1 : 0;
+
+  ASSERT_EQ(sort_fails,0);
+  ASSERT_EQ(equal_sum,1);
+}
+
+template<class ExecutionSpace, typename KeyType>
+void test_3D_sort(unsigned int n) {
+  typedef Kokkos::View<KeyType*[3],ExecutionSpace > KeyViewType;
+
+  KeyViewType keys("Keys",n*n*n);
+
+  Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
+  Kokkos::fill_random(keys,g,100.0);
+
+  double sum_before = 0.0;
+  double sum_after = 0.0;
+  unsigned int sort_fails = 0;
+
+  Kokkos::parallel_reduce(keys.dimension_0(),sum3D<ExecutionSpace, KeyType>(keys),sum_before);
+
+  int bin_1d = 1;
+  while( bin_1d*bin_1d*bin_1d*4< (int) keys.dimension_0() ) bin_1d*=2;
+  int bin_max[3] = {bin_1d,bin_1d,bin_1d};
+  typename KeyViewType::value_type min[3] = {0,0,0};
+  typename KeyViewType::value_type max[3] = {100,100,100};
+
+  typedef Kokkos::SortImpl::DefaultBinOp3D< KeyViewType > BinOp;
+  BinOp bin_op(bin_max,min,max);
+  Kokkos::BinSort< KeyViewType , BinOp >
+    Sorter(keys,bin_op,false);
+  Sorter.create_permute_vector();
+  Sorter.template sort< KeyViewType >(keys);
+
+  Kokkos::parallel_reduce(keys.dimension_0(),sum3D<ExecutionSpace, KeyType>(keys),sum_after);
+  Kokkos::parallel_reduce(keys.dimension_0()-1,bin3d_is_sorted_struct<ExecutionSpace, KeyType>(keys,bin_1d,min[0],max[0]),sort_fails);
+
+  double ratio = sum_before/sum_after;
+  double epsilon = 1e-10;
+  unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 1 : 0;
+
+  printf("3D Sort Sum: %f %f Fails: %u\n",sum_before,sum_after,sort_fails);
+  ASSERT_EQ(sort_fails,0);
+  ASSERT_EQ(equal_sum,1);
+}
+
+template<class ExecutionSpace, typename KeyType>
+void test_sort(unsigned int N)
+{
+  test_1D_sort<ExecutionSpace,KeyType>(N*N*N, true);
+  test_1D_sort<ExecutionSpace,KeyType>(N*N*N, false);
+  test_3D_sort<ExecutionSpace,KeyType>(N);
+}
+
+}
+}
+#endif /* TESTSORT_HPP_ */
diff --git a/lib/kokkos/algorithms/unit_tests/TestThreads.cpp b/lib/kokkos/algorithms/unit_tests/TestThreads.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..a61d6c8bd59bb9758f7ff30124b048150ac0cb92
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/TestThreads.cpp
@@ -0,0 +1,113 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <TestRandom.hpp>
+#include <TestSort.hpp>
+#include <iomanip>
+
+
+//----------------------------------------------------------------------------
+
+
+namespace Test {
+
+#ifdef KOKKOS_HAVE_PTHREAD
+class threads : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+
+    unsigned num_threads = 4;
+
+    if (Kokkos::hwloc::available()) {
+      num_threads = Kokkos::hwloc::get_available_numa_count()
+                    * Kokkos::hwloc::get_available_cores_per_numa()
+                 // * Kokkos::hwloc::get_available_threads_per_core()
+                    ;
+
+    }
+
+    std::cout << "Threads: " << num_threads << std::endl;
+
+    Kokkos::Threads::initialize( num_threads );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::Threads::finalize();
+  }
+};
+
+#define THREADS_RANDOM_XORSHIFT64( num_draws )                                \
+  TEST_F( threads, Random_XorShift64 ) {   \
+      Impl::test_random<Kokkos::Random_XorShift64_Pool<Kokkos::Threads> >(num_draws);                                   \
+  }
+
+#define THREADS_RANDOM_XORSHIFT1024( num_draws )                                \
+  TEST_F( threads, Random_XorShift1024 ) {   \
+      Impl::test_random<Kokkos::Random_XorShift1024_Pool<Kokkos::Threads> >(num_draws);                                   \
+  }
+
+#define THREADS_SORT_UNSIGNED( size )                                \
+  TEST_F( threads, SortUnsigned ) {   \
+      Impl::test_sort< Kokkos::Threads, double >(size);                                   \
+  }
+
+
+THREADS_RANDOM_XORSHIFT64( 10240000 )
+THREADS_RANDOM_XORSHIFT1024( 10130144 )
+THREADS_SORT_UNSIGNED(171)
+
+#undef THREADS_RANDOM_XORSHIFT64
+#undef THREADS_RANDOM_XORSHIFT1024
+#undef THREADS_SORT_UNSIGNED
+
+#endif
+} // namespace Test
+
+
diff --git a/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp b/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..f952ab3db51028aff0a0ebfe313b2639e353ab87
--- /dev/null
+++ b/lib/kokkos/algorithms/unit_tests/UnitTestMain.cpp
@@ -0,0 +1,50 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+int main(int argc, char *argv[]) {
+  ::testing::InitGoogleTest(&argc,argv);
+  return RUN_ALL_TESTS();
+}
+
diff --git a/lib/kokkos/containers/performance_tests/Makefile b/lib/kokkos/containers/performance_tests/Makefile
new file mode 100755
index 0000000000000000000000000000000000000000..7ced9452826bc72cd957d7a1943bf55f9f01285d
--- /dev/null
+++ b/lib/kokkos/containers/performance_tests/Makefile
@@ -0,0 +1,81 @@
+KOKKOS_PATH = ../..
+
+GTEST_PATH = ../../TPL/gtest
+
+vpath %.cpp ${KOKKOS_PATH}/containers/performance_tests
+
+default: build_all
+	echo "End Build"
+	
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	CXX = nvcc_wrapper
+	CXXFLAGS ?= -O3
+	LINK = $(CXX)
+	LDFLAGS ?= -lpthread
+else
+	CXX ?= g++
+	CXXFLAGS ?= -O3
+	LINK ?= $(CXX)
+	LDFLAGS ?= -lpthread
+endif
+
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/performance_tests
+
+TEST_TARGETS = 
+TARGETS = 
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	OBJ_CUDA = TestCuda.o TestMain.o gtest-all.o
+	TARGETS += KokkosContainers_PerformanceTest_Cuda
+	TEST_TARGETS += test-cuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+	OBJ_THREADS = TestThreads.o TestMain.o gtest-all.o
+	TARGETS += KokkosContainers_PerformanceTest_Threads
+	TEST_TARGETS += test-threads
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	OBJ_OPENMP = TestOpenMP.o TestMain.o gtest-all.o
+	TARGETS += KokkosContainers_PerformanceTest_OpenMP
+	TEST_TARGETS += test-openmp
+endif
+
+KokkosContainers_PerformanceTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Cuda
+
+KokkosContainers_PerformanceTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Threads
+	
+KokkosContainers_PerformanceTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_OpenMP
+
+test-cuda: KokkosContainers_PerformanceTest_Cuda
+	./KokkosContainers_PerformanceTest_Cuda
+
+test-threads: KokkosContainers_PerformanceTest_Threads
+	./KokkosContainers_PerformanceTest_Threads
+
+test-openmp: KokkosContainers_PerformanceTest_OpenMP
+	./KokkosContainers_PerformanceTest_OpenMP
+
+	
+build_all: $(TARGETS)
+
+test: $(TEST_TARGETS)
+	
+clean: kokkos-clean 
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
+gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc 
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
+
diff --git a/lib/kokkos/containers/performance_tests/TestCuda.cpp b/lib/kokkos/containers/performance_tests/TestCuda.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..aee262de93eecfe79314e217252bbcd15a847353
--- /dev/null
+++ b/lib/kokkos/containers/performance_tests/TestCuda.cpp
@@ -0,0 +1,100 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdint.h>
+#include <string>
+#include <iostream>
+#include <iomanip>
+#include <sstream>
+#include <fstream>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_HAVE_CUDA )
+
+#include <Kokkos_UnorderedMap.hpp>
+
+#include <TestGlobal2LocalIds.hpp>
+
+#include <TestUnorderedMapPerformance.hpp>
+
+namespace Performance {
+
+class cuda : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+    Kokkos::HostSpace::execution_space::initialize();
+    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
+  }
+  static void TearDownTestCase()
+  {
+    Kokkos::Cuda::finalize();
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+};
+
+TEST_F( cuda, global_2_local)
+{
+  std::cout << "Cuda" << std::endl;
+  std::cout << "size, create, generate, fill, find" << std::endl;
+  for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step)
+    test_global_to_local_ids<Kokkos::Cuda>(i);
+}
+
+TEST_F( cuda, unordered_map_performance_near)
+{
+  Perf::run_performance_tests<Kokkos::Cuda,true>("cuda-near");
+}
+
+TEST_F( cuda, unordered_map_performance_far)
+{
+  Perf::run_performance_tests<Kokkos::Cuda,false>("cuda-far");
+}
+
+}
+
+#endif  /* #if defined( KOKKOS_HAVE_CUDA ) */
diff --git a/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp b/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..fb70b8fe2e9dc8c0a0cc5ed6787b8afa86e666df
--- /dev/null
+++ b/lib/kokkos/containers/performance_tests/TestGlobal2LocalIds.hpp
@@ -0,0 +1,231 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_GLOBAL_TO_LOCAL_IDS_HPP
+#define KOKKOS_TEST_GLOBAL_TO_LOCAL_IDS_HPP
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+#include <vector>
+#include <algorithm>
+
+#include <impl/Kokkos_Timer.hpp>
+
+// This test will simulate global ids
+
+namespace Performance {
+
+static const unsigned begin_id_size = 256u;
+static const unsigned end_id_size = 1u << 22;
+static const unsigned id_step = 2u;
+
+union helper
+{
+  uint32_t word;
+  uint8_t byte[4];
+};
+
+
+template <typename Device>
+struct generate_ids
+{
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+  typedef Kokkos::View<uint32_t*,execution_space> local_id_view;
+
+  local_id_view local_2_global;
+
+  generate_ids( local_id_view & ids)
+    : local_2_global(ids)
+  {
+    Kokkos::parallel_for(local_2_global.dimension_0(), *this);
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type i) const
+  {
+
+    helper x = {static_cast<uint32_t>(i)};
+
+    // shuffle the bytes of i to create a unique, semi-random global_id
+    x.word = ~x.word;
+
+    uint8_t tmp = x.byte[3];
+    x.byte[3] = x.byte[1];
+    x.byte[1] = tmp;
+
+    tmp = x.byte[2];
+    x.byte[2] = x.byte[0];
+    x.byte[0] = tmp;
+
+    local_2_global[i] = x.word;
+  }
+
+};
+
+template <typename Device>
+struct fill_map
+{
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+  typedef Kokkos::View<const uint32_t*,execution_space, Kokkos::MemoryRandomAccess> local_id_view;
+  typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view;
+
+  global_id_view global_2_local;
+  local_id_view local_2_global;
+
+  fill_map( global_id_view gIds, local_id_view lIds)
+    : global_2_local(gIds) , local_2_global(lIds)
+  {
+    Kokkos::parallel_for(local_2_global.dimension_0(), *this);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type i) const
+  {
+    global_2_local.insert( local_2_global[i], i);
+  }
+
+};
+
+template <typename Device>
+struct find_test
+{
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+  typedef Kokkos::View<const uint32_t*,execution_space, Kokkos::MemoryRandomAccess> local_id_view;
+  typedef Kokkos::UnorderedMap<const uint32_t, const size_type,execution_space> global_id_view;
+
+  global_id_view global_2_local;
+  local_id_view local_2_global;
+
+  typedef size_t value_type;
+
+  find_test( global_id_view gIds, local_id_view lIds, value_type & num_errors)
+    : global_2_local(gIds) , local_2_global(lIds)
+  {
+    Kokkos::parallel_reduce(local_2_global.dimension_0(), *this, num_errors);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type & v) const
+  { v = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type & dst, volatile value_type const & src) const
+  { dst += src; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type i, value_type & num_errors) const
+  {
+    uint32_t index = global_2_local.find( local_2_global[i] );
+
+    if ( global_2_local.value_at(index) != i) ++num_errors;
+  }
+
+};
+
+template <typename Device>
+void test_global_to_local_ids(unsigned num_ids)
+{
+
+  typedef Device execution_space;
+  typedef typename execution_space::size_type size_type;
+
+  typedef Kokkos::View<uint32_t*,execution_space> local_id_view;
+  typedef Kokkos::UnorderedMap<uint32_t,size_type,execution_space> global_id_view;
+
+  //size
+  std::cout << num_ids << ", ";
+
+  double elasped_time = 0;
+  Kokkos::Impl::Timer timer;
+
+  local_id_view local_2_global("local_ids", num_ids);
+  global_id_view global_2_local((3u*num_ids)/2u);
+
+  //create
+  elasped_time = timer.seconds();
+  std::cout << elasped_time << ", ";
+  timer.reset();
+
+  // generate unique ids
+  {
+    generate_ids<Device> gen(local_2_global);
+  }
+  Device::fence();
+  // generate
+  elasped_time = timer.seconds();
+  std::cout << elasped_time << ", ";
+  timer.reset();
+
+  {
+    fill_map<Device> fill(global_2_local, local_2_global);
+  }
+  Device::fence();
+
+  // fill
+  elasped_time = timer.seconds();
+  std::cout << elasped_time << ", ";
+  timer.reset();
+
+
+  size_t num_errors = 0;
+  for (int i=0; i<100; ++i)
+  {
+    find_test<Device> find(global_2_local, local_2_global,num_errors);
+  }
+  Device::fence();
+
+  // find
+  elasped_time = timer.seconds();
+  std::cout << elasped_time << std::endl;
+
+  ASSERT_EQ( num_errors, 0u);
+}
+
+
+} // namespace Performance
+
+
+#endif //KOKKOS_TEST_GLOBAL_TO_LOCAL_IDS_HPP
+
diff --git a/lib/kokkos/containers/performance_tests/TestMain.cpp b/lib/kokkos/containers/performance_tests/TestMain.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..f952ab3db51028aff0a0ebfe313b2639e353ab87
--- /dev/null
+++ b/lib/kokkos/containers/performance_tests/TestMain.cpp
@@ -0,0 +1,50 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+int main(int argc, char *argv[]) {
+  ::testing::InitGoogleTest(&argc,argv);
+  return RUN_ALL_TESTS();
+}
+
diff --git a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..82a9311df71108d2f05b6020a31764f91be36600
--- /dev/null
+++ b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp
@@ -0,0 +1,131 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_UnorderedMap.hpp>
+
+#include <TestGlobal2LocalIds.hpp>
+#include <TestUnorderedMapPerformance.hpp>
+
+#include <iomanip>
+#include <sstream>
+#include <string>
+#include <fstream>
+
+
+namespace Performance {
+
+class openmp : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+
+    unsigned num_threads = 4;
+
+    if (Kokkos::hwloc::available()) {
+      num_threads = Kokkos::hwloc::get_available_numa_count()
+                    * Kokkos::hwloc::get_available_cores_per_numa()
+                    * Kokkos::hwloc::get_available_threads_per_core()
+                    ;
+
+    }
+
+    std::cout << "OpenMP: " << num_threads << std::endl;
+
+    Kokkos::OpenMP::initialize( num_threads );
+
+    std::cout << "available threads: " << omp_get_max_threads() << std::endl;
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::OpenMP::finalize();
+
+    omp_set_num_threads(1);
+
+    ASSERT_EQ( 1 , omp_get_max_threads() );
+  }
+};
+
+TEST_F( openmp, global_2_local)
+{
+  std::cout << "OpenMP" << std::endl;
+  std::cout << "size, create, generate, fill, find" << std::endl;
+  for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step)
+    test_global_to_local_ids<Kokkos::OpenMP>(i);
+}
+
+TEST_F( openmp, unordered_map_performance_near)
+{
+  unsigned num_openmp = 4;
+  if (Kokkos::hwloc::available()) {
+    num_openmp = Kokkos::hwloc::get_available_numa_count() *
+                  Kokkos::hwloc::get_available_cores_per_numa() *
+                  Kokkos::hwloc::get_available_threads_per_core();
+
+  }
+  std::ostringstream base_file_name;
+  base_file_name << "openmp-" << num_openmp << "-near";
+  Perf::run_performance_tests<Kokkos::OpenMP,true>(base_file_name.str());
+}
+
+TEST_F( openmp, unordered_map_performance_far)
+{
+  unsigned num_openmp = 4;
+  if (Kokkos::hwloc::available()) {
+    num_openmp = Kokkos::hwloc::get_available_numa_count() *
+                  Kokkos::hwloc::get_available_cores_per_numa() *
+                  Kokkos::hwloc::get_available_threads_per_core();
+
+  }
+  std::ostringstream base_file_name;
+  base_file_name << "openmp-" << num_openmp << "-far";
+  Perf::run_performance_tests<Kokkos::OpenMP,false>(base_file_name.str());
+}
+
+} // namespace test
+
diff --git a/lib/kokkos/containers/performance_tests/TestThreads.cpp b/lib/kokkos/containers/performance_tests/TestThreads.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..04d9dc0c187f1006c563e84d55b16780485daec7
--- /dev/null
+++ b/lib/kokkos/containers/performance_tests/TestThreads.cpp
@@ -0,0 +1,126 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_UnorderedMap.hpp>
+
+#include <iomanip>
+
+#include <TestGlobal2LocalIds.hpp>
+#include <TestUnorderedMapPerformance.hpp>
+
+#include <iomanip>
+#include <sstream>
+#include <string>
+#include <fstream>
+
+namespace Performance {
+
+class threads : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+
+    unsigned num_threads = 4;
+
+    if (Kokkos::hwloc::available()) {
+      num_threads = Kokkos::hwloc::get_available_numa_count() *
+                    Kokkos::hwloc::get_available_cores_per_numa() *
+                    Kokkos::hwloc::get_available_threads_per_core();
+
+    }
+
+    std::cout << "Threads: " << num_threads << std::endl;
+
+    Kokkos::Threads::initialize( num_threads );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::Threads::finalize();
+  }
+};
+
+TEST_F( threads, global_2_local)
+{
+  std::cout << "Threads" << std::endl;
+  std::cout << "size, create, generate, fill, find" << std::endl;
+  for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step)
+    test_global_to_local_ids<Kokkos::Threads>(i);
+}
+
+TEST_F( threads, unordered_map_performance_near)
+{
+  unsigned num_threads = 4;
+  if (Kokkos::hwloc::available()) {
+    num_threads = Kokkos::hwloc::get_available_numa_count() *
+                  Kokkos::hwloc::get_available_cores_per_numa() *
+                  Kokkos::hwloc::get_available_threads_per_core();
+
+  }
+  std::ostringstream base_file_name;
+  base_file_name << "threads-" << num_threads << "-near";
+  Perf::run_performance_tests<Kokkos::Threads,true>(base_file_name.str());
+}
+
+TEST_F( threads, unordered_map_performance_far)
+{
+  unsigned num_threads = 4;
+  if (Kokkos::hwloc::available()) {
+    num_threads = Kokkos::hwloc::get_available_numa_count() *
+                  Kokkos::hwloc::get_available_cores_per_numa() *
+                  Kokkos::hwloc::get_available_threads_per_core();
+
+  }
+  std::ostringstream base_file_name;
+  base_file_name << "threads-" << num_threads << "-far";
+  Perf::run_performance_tests<Kokkos::Threads,false>(base_file_name.str());
+}
+
+} // namespace Performance
+
+
diff --git a/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp b/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..975800229cbcb67c6e7e788842a3db06d97f0a21
--- /dev/null
+++ b/lib/kokkos/containers/performance_tests/TestUnorderedMapPerformance.hpp
@@ -0,0 +1,262 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP
+#define KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP
+
+#include <impl/Kokkos_Timer.hpp>
+
+#include <iostream>
+#include <iomanip>
+#include <fstream>
+#include <string>
+#include <sstream>
+
+
+namespace Perf {
+
+template <typename Device, bool Near>
+struct UnorderedMapTest
+{
+  typedef Device execution_space;
+  typedef Kokkos::UnorderedMap<uint32_t, uint32_t, execution_space> map_type;
+  typedef typename map_type::histogram_type histogram_type;
+
+  struct value_type {
+    uint32_t failed_count;
+    uint32_t max_list;
+  };
+
+  uint32_t capacity;
+  uint32_t inserts;
+  uint32_t collisions;
+  double   seconds;
+  map_type map;
+  histogram_type histogram;
+
+  UnorderedMapTest( uint32_t arg_capacity, uint32_t arg_inserts, uint32_t arg_collisions)
+    : capacity(arg_capacity)
+    , inserts(arg_inserts)
+    , collisions(arg_collisions)
+    , seconds(0)
+    , map(capacity)
+    , histogram(map.get_histogram())
+  {
+    Kokkos::Impl::Timer wall_clock ;
+    wall_clock.reset();
+
+    value_type v = {};
+    int loop_count = 0;
+    do {
+      ++loop_count;
+
+      v = value_type();
+      Kokkos::parallel_reduce(inserts, *this, v);
+
+      if (v.failed_count > 0u) {
+        const uint32_t new_capacity = map.capacity() + ((map.capacity()*3ull)/20u) + v.failed_count/collisions ;
+        map.rehash( new_capacity );
+      }
+    } while (v.failed_count > 0u);
+
+    seconds = wall_clock.seconds();
+
+    switch (loop_count)
+    {
+    case 1u: std::cout << " \033[0;32m" << loop_count << "\033[0m "; break;
+    case 2u: std::cout << " \033[1;31m" << loop_count << "\033[0m "; break;
+    default: std::cout << " \033[0;31m" << loop_count << "\033[0m "; break;
+    }
+    std::cout << std::setprecision(2) << std::fixed << std::setw(5) << (1e9*(seconds/(inserts))) << "; " << std::flush;
+
+    histogram.calculate();
+    Device::fence();
+  }
+
+  void print(std::ostream & metrics_out, std::ostream & length_out, std::ostream & distance_out, std::ostream & block_distance_out)
+  {
+    metrics_out << map.capacity() << " , ";
+    metrics_out << inserts/collisions << " , ";
+    metrics_out << (100.0 * inserts/collisions) / map.capacity() << " , ";
+    metrics_out << inserts << " , ";
+    metrics_out << (map.failed_insert() ? "true" : "false") << " , ";
+    metrics_out << collisions << " , ";
+    metrics_out << 1e9*(seconds/inserts) << " , ";
+    metrics_out << seconds << std::endl;
+
+    length_out << map.capacity() << " , ";
+    length_out << ((100.0 *inserts/collisions) / map.capacity()) << " , ";
+    length_out << collisions << " , ";
+    histogram.print_length(length_out);
+
+    distance_out << map.capacity() << " , ";
+    distance_out << ((100.0 *inserts/collisions) / map.capacity()) << " , ";
+    distance_out << collisions << " , ";
+    histogram.print_distance(distance_out);
+
+    block_distance_out << map.capacity() << " , ";
+    block_distance_out << ((100.0 *inserts/collisions) / map.capacity()) << " , ";
+    block_distance_out << collisions << " , ";
+    histogram.print_block_distance(block_distance_out);
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & v ) const
+  {
+    v.failed_count = 0;
+    v.max_list = 0;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst, const volatile value_type & src ) const
+  {
+    dst.failed_count += src.failed_count;
+    dst.max_list = src.max_list < dst.max_list ? dst.max_list : src.max_list;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(uint32_t i, value_type & v) const
+  {
+    const uint32_t key = Near ? i/collisions : i%(inserts/collisions);
+    typename map_type::insert_result result = map.insert(key,i);
+    v.failed_count += !result.failed() ? 0 : 1;
+    v.max_list = result.list_position() < v.max_list ? v.max_list : result.list_position();
+  }
+
+};
+
+//#define KOKKOS_COLLECT_UNORDERED_MAP_METRICS
+
+template <typename Device, bool Near>
+void run_performance_tests(std::string const & base_file_name)
+{
+#if defined(KOKKOS_COLLECT_UNORDERED_MAP_METRICS)
+  std::string metrics_file_name = base_file_name + std::string("-metrics.csv");
+  std::string length_file_name = base_file_name  + std::string("-length.csv");
+  std::string distance_file_name = base_file_name + std::string("-distance.csv");
+  std::string block_distance_file_name = base_file_name + std::string("-block_distance.csv");
+
+  std::ofstream metrics_out( metrics_file_name.c_str(), std::ofstream::out );
+  std::ofstream length_out( length_file_name.c_str(), std::ofstream::out );
+  std::ofstream distance_out( distance_file_name.c_str(), std::ofstream::out );
+  std::ofstream block_distance_out( block_distance_file_name.c_str(), std::ofstream::out );
+
+
+  /*
+  const double test_ratios[] = {
+     0.50
+   , 0.75
+   , 0.80
+   , 0.85
+   , 0.90
+   , 0.95
+   , 1.00
+   , 1.25
+   , 2.00
+  };
+  */
+
+  const double test_ratios[] = { 1.00 };
+
+  const int num_ratios = sizeof(test_ratios) / sizeof(double);
+
+  /*
+  const uint32_t collisions[] {
+      1
+    , 4
+    , 16
+    , 64
+  };
+  */
+
+  const uint32_t collisions[] = { 16 };
+
+  const int num_collisions = sizeof(collisions) / sizeof(uint32_t);
+
+  // set up file headers
+  metrics_out << "Capacity , Unique , Percent Full , Attempted Inserts , Failed Inserts , Collision Ratio , Nanoseconds/Inserts, Seconds" << std::endl;
+  length_out << "Capacity , Percent Full , ";
+  distance_out << "Capacity , Percent Full , ";
+  block_distance_out << "Capacity , Percent Full , ";
+
+  for (int i=0; i<100; ++i) {
+    length_out << i << " , ";
+    distance_out << i << " , ";
+    block_distance_out << i << " , ";
+  }
+
+  length_out << "\b\b\b   " << std::endl;
+  distance_out << "\b\b\b   " << std::endl;
+  block_distance_out << "\b\b\b   " << std::endl;
+
+  Kokkos::Impl::Timer wall_clock ;
+  for (int i=0;  i < num_collisions ; ++i) {
+    wall_clock.reset();
+    std::cout << "Collisions: " << collisions[i] << std::endl;
+    for (int j = 0; j < num_ratios; ++j) {
+      std::cout << std::setprecision(1) << std::fixed << std::setw(5) << (100.0*test_ratios[j]) << "%  " << std::flush;
+      for (uint32_t capacity = 1<<14; capacity < 1<<25; capacity = capacity << 1) {
+        uint32_t inserts = static_cast<uint32_t>(test_ratios[j]*(capacity));
+        std::cout << capacity << std::flush;
+        UnorderedMapTest<Device, Near> test(capacity, inserts*collisions[i], collisions[i]);
+        Device::fence();
+        test.print(metrics_out, length_out, distance_out, block_distance_out);
+      }
+      std::cout << "\b\b  " <<  std::endl;
+
+    }
+    std::cout << "  " << wall_clock.seconds() << " secs" << std::endl;
+  }
+  metrics_out.close();
+  length_out.close();
+  distance_out.close();
+  block_distance_out.close();
+#else
+  (void)base_file_name;
+  std::cout << "skipping test" << std::endl;
+#endif
+}
+
+
+} // namespace Perf
+
+#endif //KOKKOS_TEST_UNORDERED_MAP_PERFORMANCE_HPP
diff --git a/lib/kokkos/containers/src/Kokkos_Bitset.hpp b/lib/kokkos/containers/src/Kokkos_Bitset.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..b51b1c2b26560bc67a6e5e421242436cc0d435ce
--- /dev/null
+++ b/lib/kokkos/containers/src/Kokkos_Bitset.hpp
@@ -0,0 +1,437 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_BITSET_HPP
+#define KOKKOS_BITSET_HPP
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Functional.hpp>
+
+#include <impl/Kokkos_Bitset_impl.hpp>
+
+#include <stdexcept>
+
+namespace Kokkos {
+
+template <typename Device = Kokkos::DefaultExecutionSpace >
+class Bitset;
+
+template <typename Device = Kokkos::DefaultExecutionSpace >
+class ConstBitset;
+
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src);
+
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
+
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
+
+
+/// A thread safe view to a bitset
+template <typename Device>
+class Bitset
+{
+public:
+  typedef Device execution_space;
+  typedef unsigned size_type;
+
+  enum { BIT_SCAN_REVERSE = 1u };
+  enum { MOVE_HINT_BACKWARD = 2u };
+
+  enum {
+      BIT_SCAN_FORWARD_MOVE_HINT_FORWARD = 0u
+    , BIT_SCAN_REVERSE_MOVE_HINT_FORWARD = BIT_SCAN_REVERSE
+    , BIT_SCAN_FORWARD_MOVE_HINT_BACKWARD = MOVE_HINT_BACKWARD
+    , BIT_SCAN_REVERSE_MOVE_HINT_BACKWARD = BIT_SCAN_REVERSE | MOVE_HINT_BACKWARD
+  };
+
+private:
+  enum { block_size = static_cast<unsigned>(sizeof(unsigned)*CHAR_BIT) };
+  enum { block_mask = block_size-1u };
+  enum { block_shift = static_cast<int>(Impl::power_of_two<block_size>::value) };
+
+public:
+
+
+  /// constructor
+  /// arg_size := number of bit in set
+  Bitset(unsigned arg_size = 0u)
+    : m_size(arg_size)
+    , m_last_block_mask(0u)
+    , m_blocks("Bitset", ((m_size + block_mask) >> block_shift) )
+  {
+    for (int i=0, end = static_cast<int>(m_size & block_mask); i < end; ++i) {
+      m_last_block_mask |= 1u << i;
+    }
+  }
+
+  /// assignment
+  Bitset<Device> & operator = (Bitset<Device> const & rhs)
+  {
+    this->m_size = rhs.m_size;
+    this->m_last_block_mask = rhs.m_last_block_mask;
+    this->m_blocks = rhs.m_blocks;
+
+    return *this;
+  }
+
+  /// copy constructor
+  Bitset( Bitset<Device> const & rhs)
+    : m_size( rhs.m_size )
+    , m_last_block_mask( rhs.m_last_block_mask )
+    , m_blocks( rhs.m_blocks )
+  {}
+
+  /// number of bits in the set
+  /// can be call from the host or the device
+  KOKKOS_FORCEINLINE_FUNCTION
+  unsigned size() const
+  { return m_size; }
+
+  /// number of bits which are set to 1
+  /// can only be called from the host
+  unsigned count() const
+  {
+    Impl::BitsetCount< Bitset<Device> > f(*this);
+    return f.apply();
+  }
+
+  /// set all bits to 1
+  /// can only be called from the host
+  void set()
+  {
+    Kokkos::deep_copy(m_blocks, ~0u );
+
+    if (m_last_block_mask) {
+      //clear the unused bits in the last block
+      typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy;
+      raw_deep_copy( m_blocks.ptr_on_device() + (m_blocks.dimension_0() -1u), &m_last_block_mask, sizeof(unsigned));
+    }
+  }
+
+  /// set all bits to 0
+  /// can only be called from the host
+  void reset()
+  {
+    Kokkos::deep_copy(m_blocks, 0u );
+  }
+
+  /// set all bits to 0
+  /// can only be called from the host
+  void clear()
+  {
+    Kokkos::deep_copy(m_blocks, 0u );
+  }
+
+  /// set i'th bit to 1
+  /// can only be called from the device
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool set( unsigned i ) const
+  {
+    if ( i < m_size ) {
+      unsigned * block_ptr = &m_blocks[ i >> block_shift ];
+      const unsigned mask = 1u << static_cast<int>( i & block_mask );
+
+      return !( atomic_fetch_or( block_ptr, mask ) & mask );
+    }
+    return false;
+  }
+
+  /// set i'th bit to 0
+  /// can only be called from the device
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool reset( unsigned i ) const
+  {
+    if ( i < m_size ) {
+      unsigned * block_ptr = &m_blocks[ i >> block_shift ];
+      const unsigned mask = 1u << static_cast<int>( i & block_mask );
+
+      return atomic_fetch_and( block_ptr, ~mask ) & mask;
+    }
+    return false;
+  }
+
+  /// return true if the i'th bit set to 1
+  /// can only be called from the device
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool test( unsigned i ) const
+  {
+    if ( i < m_size ) {
+      const unsigned block = volatile_load(&m_blocks[ i >> block_shift ]);
+      const unsigned mask = 1u << static_cast<int>( i & block_mask );
+      return block & mask;
+    }
+    return false;
+  }
+
+  /// used with find_any_set_near or find_any_unset_near functions
+  /// returns the max number of times those functions should be call
+  /// when searching for an available bit
+  KOKKOS_FORCEINLINE_FUNCTION
+  unsigned max_hint() const
+  {
+    return m_blocks.dimension_0();
+  }
+
+  /// find a bit set to 1 near the hint
+  /// returns a pair< bool, unsigned> where if result.first is true then result.second is the bit found
+  /// and if result.first is false the result.second is a new hint
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::pair<bool, unsigned> find_any_set_near( unsigned hint , unsigned scan_direction = BIT_SCAN_FORWARD_MOVE_HINT_FORWARD ) const
+  {
+    const unsigned block_idx = (hint >> block_shift) < m_blocks.dimension_0() ? (hint >> block_shift) : 0;
+    const unsigned offset = hint & block_mask;
+    unsigned block = volatile_load(&m_blocks[ block_idx ]);
+    block = !m_last_block_mask || (block_idx < (m_blocks.dimension_0()-1)) ? block : block & m_last_block_mask ;
+
+    return find_any_helper(block_idx, offset, block, scan_direction);
+  }
+
+  /// find a bit set to 0 near the hint
+  /// returns a pair< bool, unsigned> where if result.first is true then result.second is the bit found
+  /// and if result.first is false the result.second is a new hint
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::pair<bool, unsigned> find_any_unset_near( unsigned hint , unsigned scan_direction = BIT_SCAN_FORWARD_MOVE_HINT_FORWARD ) const
+  {
+    const unsigned block_idx = hint >> block_shift;
+    const unsigned offset = hint & block_mask;
+    unsigned block = volatile_load(&m_blocks[ block_idx ]);
+    block = !m_last_block_mask || (block_idx < (m_blocks.dimension_0()-1) ) ? ~block : ~block & m_last_block_mask ;
+
+    return find_any_helper(block_idx, offset, block, scan_direction);
+  }
+
+private:
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  Kokkos::pair<bool, unsigned> find_any_helper(unsigned block_idx, unsigned offset, unsigned block, unsigned scan_direction) const
+  {
+    Kokkos::pair<bool, unsigned> result( block > 0u, 0);
+
+    if (!result.first) {
+      result.second = update_hint( block_idx, offset, scan_direction );
+    }
+    else {
+      result.second = scan_block(  (block_idx << block_shift)
+                                 , offset
+                                 , block
+                                 , scan_direction
+                                );
+    }
+    return result;
+  }
+
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  unsigned scan_block(unsigned block_start, int offset, unsigned block, unsigned scan_direction ) const
+  {
+    offset = !(scan_direction & BIT_SCAN_REVERSE) ? offset : (offset + block_mask) & block_mask;
+    block = Impl::rotate_right(block, offset);
+    return ((( !(scan_direction & BIT_SCAN_REVERSE) ?
+               Impl::bit_scan_forward(block) :
+               Impl::bit_scan_reverse(block)
+             ) + offset
+            ) & block_mask
+           ) + block_start;
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  unsigned update_hint( long long block_idx, unsigned offset, unsigned scan_direction ) const
+  {
+    block_idx += scan_direction & MOVE_HINT_BACKWARD ? -1 : 1;
+    block_idx = block_idx >= 0 ? block_idx : m_blocks.dimension_0() - 1;
+    block_idx = block_idx < static_cast<long long>(m_blocks.dimension_0()) ? block_idx : 0;
+
+    return static_cast<unsigned>(block_idx)*block_size + offset;
+  }
+
+private:
+
+  unsigned m_size;
+  unsigned m_last_block_mask;
+  View< unsigned *, execution_space, MemoryTraits<RandomAccess> > m_blocks;
+
+private:
+  template <typename DDevice>
+  friend class Bitset;
+
+  template <typename DDevice>
+  friend class ConstBitset;
+
+  template <typename Bitset>
+  friend struct Impl::BitsetCount;
+
+  template <typename DstDevice, typename SrcDevice>
+  friend void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src);
+
+  template <typename DstDevice, typename SrcDevice>
+  friend void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
+};
+
+/// a thread-safe view to a const bitset
+/// i.e. can only test bits
+template <typename Device>
+class ConstBitset
+{
+public:
+  typedef Device execution_space;
+  typedef unsigned size_type;
+
+private:
+  enum { block_size = static_cast<unsigned>(sizeof(unsigned)*CHAR_BIT) };
+  enum { block_mask = block_size -1u };
+  enum { block_shift = static_cast<int>(Impl::power_of_two<block_size>::value) };
+
+public:
+  ConstBitset()
+    : m_size (0)
+  {}
+
+  ConstBitset(Bitset<Device> const& rhs)
+    : m_size(rhs.m_size)
+    , m_blocks(rhs.m_blocks)
+  {}
+
+  ConstBitset(ConstBitset<Device> const& rhs)
+    : m_size( rhs.m_size )
+    , m_blocks( rhs.m_blocks )
+  {}
+
+  ConstBitset<Device> & operator = (Bitset<Device> const & rhs)
+  {
+    this->m_size = rhs.m_size;
+    this->m_blocks = rhs.m_blocks;
+
+    return *this;
+  }
+
+  ConstBitset<Device> & operator = (ConstBitset<Device> const & rhs)
+  {
+    this->m_size = rhs.m_size;
+    this->m_blocks = rhs.m_blocks;
+
+    return *this;
+  }
+
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  unsigned size() const
+  {
+    return m_size;
+  }
+
+  unsigned count() const
+  {
+    Impl::BitsetCount< ConstBitset<Device> > f(*this);
+    return f.apply();
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool test( unsigned i ) const
+  {
+    if ( i < m_size ) {
+      const unsigned block = m_blocks[ i >> block_shift ];
+      const unsigned mask = 1u << static_cast<int>( i & block_mask );
+      return block & mask;
+    }
+    return false;
+  }
+
+private:
+
+  unsigned m_size;
+  View< const unsigned *, execution_space, MemoryTraits<RandomAccess> > m_blocks;
+
+private:
+  template <typename DDevice>
+  friend class ConstBitset;
+
+  template <typename Bitset>
+  friend struct Impl::BitsetCount;
+
+  template <typename DstDevice, typename SrcDevice>
+  friend void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
+
+  template <typename DstDevice, typename SrcDevice>
+  friend void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src);
+};
+
+
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( Bitset<DstDevice> & dst, Bitset<SrcDevice> const& src)
+{
+  if (dst.size() != src.size()) {
+    throw std::runtime_error("Error: Cannot deep_copy bitsets of different sizes!");
+  }
+
+  typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > raw_deep_copy;
+  raw_deep_copy(dst.m_blocks.ptr_on_device(), src.m_blocks.ptr_on_device(), sizeof(unsigned)*src.m_blocks.dimension_0());
+}
+
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( Bitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src)
+{
+  if (dst.size() != src.size()) {
+    throw std::runtime_error("Error: Cannot deep_copy bitsets of different sizes!");
+  }
+
+  typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > raw_deep_copy;
+  raw_deep_copy(dst.m_blocks.ptr_on_device(), src.m_blocks.ptr_on_device(), sizeof(unsigned)*src.m_blocks.dimension_0());
+}
+
+template <typename DstDevice, typename SrcDevice>
+void deep_copy( ConstBitset<DstDevice> & dst, ConstBitset<SrcDevice> const& src)
+{
+  if (dst.size() != src.size()) {
+    throw std::runtime_error("Error: Cannot deep_copy bitsets of different sizes!");
+  }
+
+  typedef Kokkos::Impl::DeepCopy< typename DstDevice::memory_space, typename SrcDevice::memory_space > raw_deep_copy;
+  raw_deep_copy(dst.m_blocks.ptr_on_device(), src.m_blocks.ptr_on_device(), sizeof(unsigned)*src.m_blocks.dimension_0());
+}
+
+} // namespace Kokkos
+
+#endif //KOKKOS_BITSET_HPP
diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..95eea57e9258cee18b4dbb0b9084d843739da88f
--- /dev/null
+++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp
@@ -0,0 +1,840 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_DualView.hpp
+/// \brief Declaration and definition of Kokkos::DualView.
+///
+/// This header file declares and defines Kokkos::DualView and its
+/// related nonmember functions.
+
+#ifndef KOKKOS_DUALVIEW_HPP
+#define KOKKOS_DUALVIEW_HPP
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+namespace Kokkos {
+
+/* \class DualView
+ * \brief Container to manage mirroring a Kokkos::View that lives
+ *   in device memory with a Kokkos::View that lives in host memory.
+ *
+ * This class provides capabilities to manage data which exists in two
+ * memory spaces at the same time.  It keeps views of the same layout
+ * on two memory spaces as well as modified flags for both
+ * allocations.  Users are responsible for setting the modified flags
+ * manually if they change the data in either memory space, by calling
+ * the sync() method templated on the device where they modified the
+ * data.  Users may synchronize data by calling the modify() function,
+ * templated on the device towards which they want to synchronize
+ * (i.e., the target of the one-way copy operation).
+ *
+ * The DualView class also provides convenience methods such as
+ * realloc, resize and capacity which call the appropriate methods of
+ * the underlying Kokkos::View objects.
+ *
+ * The four template arguments are the same as those of Kokkos::View.
+ * (Please refer to that class' documentation for a detailed
+ * description.)
+ *
+ *   \tparam DataType The type of the entries stored in the container.
+ *
+ *   \tparam Layout The array's layout in memory.
+ *
+ *   \tparam Device The Kokkos Device type.  If its memory space is
+ *     not the same as the host's memory space, then DualView will
+ *     contain two separate Views: one in device memory, and one in
+ *     host memory.  Otherwise, DualView will only store one View.
+ *
+ *   \tparam MemoryTraits (optional) The user's intended memory access
+ *     behavior.  Please see the documentation of Kokkos::View for
+ *     examples.  The default suffices for most users.
+ */
+template< class DataType ,
+          class Arg1Type = void ,
+          class Arg2Type = void ,
+          class Arg3Type = void>
+class DualView : public ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type >
+{
+public:
+  //! \name Typedefs for device types and various Kokkos::View specializations.
+  //@{
+  typedef ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ;
+
+  //! The Kokkos Host Device type;
+  typedef typename traits::host_mirror_space host_mirror_space ;
+
+  //! The type of a Kokkos::View on the device.
+  typedef View< typename traits::data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                typename traits::memory_traits > t_dev ;
+
+  /// \typedef t_host
+  /// \brief The type of a Kokkos::View host mirror of \c t_dev.
+  typedef typename t_dev::HostMirror t_host ;
+
+  //! The type of a const View on the device.
+  //! The type of a Kokkos::View on the device.
+  typedef View< typename traits::const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                typename traits::memory_traits > t_dev_const ;
+
+  /// \typedef t_host_const
+  /// \brief The type of a const View host mirror of \c t_dev_const.
+  typedef typename t_dev_const::HostMirror t_host_const;
+
+  //! The type of a const, random-access View on the device.
+  typedef View< typename traits::const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                MemoryRandomAccess > t_dev_const_randomread ;
+
+  /// \typedef t_host_const_randomread
+  /// \brief The type of a const, random-access View host mirror of
+  ///   \c t_dev_const_randomread.
+  typedef typename t_dev_const_randomread::HostMirror t_host_const_randomread;
+
+  //! The type of an unmanaged View on the device.
+  typedef View< typename traits::data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                MemoryUnmanaged> t_dev_um;
+
+  //! The type of an unmanaged View host mirror of \c t_dev_um.
+  typedef View< typename t_host::data_type ,
+                typename t_host::array_layout ,
+                typename t_host::device_type ,
+                MemoryUnmanaged> t_host_um;
+
+  //! The type of a const unmanaged View on the device.
+  typedef View< typename traits::const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                MemoryUnmanaged> t_dev_const_um;
+
+  //! The type of a const unmanaged View host mirror of \c t_dev_const_um.
+  typedef View<typename t_host::const_data_type,
+               typename t_host::array_layout,
+               typename t_host::device_type,
+               MemoryUnmanaged> t_host_const_um;
+
+  //@}
+  //! \name The two View instances.
+  //@{
+
+  t_dev d_view;
+  t_host h_view;
+
+  //@}
+  //! \name Counters to keep track of changes ("modified" flags)
+  //@{
+
+  View<unsigned int,LayoutLeft,typename t_host::execution_space> modified_device;
+  View<unsigned int,LayoutLeft,typename t_host::execution_space> modified_host;
+
+  //@}
+  //! \name Constructors
+  //@{
+
+  /// \brief Empty constructor.
+  ///
+  /// Both device and host View objects are constructed using their
+  /// default constructors.  The "modified" flags are both initialized
+  /// to "unmodified."
+  DualView () :
+    modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")),
+    modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
+  {}
+
+  /// \brief Constructor that allocates View objects on both host and device.
+  ///
+  /// This constructor works like the analogous constructor of View.
+  /// The first argument is a string label, which is entirely for your
+  /// benefit.  (Different DualView objects may have the same label if
+  /// you like.)  The arguments that follow are the dimensions of the
+  /// View objects.  For example, if the View has three dimensions,
+  /// the first three integer arguments will be nonzero, and you may
+  /// omit the integer arguments that follow.
+  DualView (const std::string& label,
+            const size_t n0 = 0,
+            const size_t n1 = 0,
+            const size_t n2 = 0,
+            const size_t n3 = 0,
+            const size_t n4 = 0,
+            const size_t n5 = 0,
+            const size_t n6 = 0,
+            const size_t n7 = 0)
+    : d_view (label, n0, n1, n2, n3, n4, n5, n6, n7)
+    , h_view (create_mirror_view (d_view)) // without UVM, host View mirrors
+    , modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device"))
+    , modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
+  {}
+
+  //! Copy constructor (shallow copy)
+  template<class SS, class LS, class DS, class MS>
+  DualView (const DualView<SS,LS,DS,MS>& src) :
+    d_view (src.d_view),
+    h_view (src.h_view),
+    modified_device (src.modified_device),
+    modified_host (src.modified_host)
+  {}
+
+  /// \brief Create DualView from existing device and host View objects.
+  ///
+  /// This constructor assumes that the device and host View objects
+  /// are synchronized.  You, the caller, are responsible for making
+  /// sure this is the case before calling this constructor.  After
+  /// this constructor returns, you may use DualView's sync() and
+  /// modify() methods to ensure synchronization of the View objects.
+  ///
+  /// \param d_view_ Device View
+  /// \param h_view_ Host View (must have type t_host = t_dev::HostMirror)
+  DualView (const t_dev& d_view_, const t_host& h_view_) :
+    d_view (d_view_),
+    h_view (h_view_),
+    modified_device (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_device")),
+    modified_host (View<unsigned int,LayoutLeft,typename t_host::execution_space> ("DualView::modified_host"))
+  {
+    Impl::assert_shapes_are_equal (d_view.shape (), h_view.shape ());
+  }
+
+  //@}
+  //! \name Methods for synchronizing, marking as modified, and getting Views.
+  //@{
+
+  /// \brief Return a View on a specific device \c Device.
+  ///
+  /// Please don't be afraid of the if_c expression in the return
+  /// value's type.  That just tells the method what the return type
+  /// should be: t_dev if the \c Device template parameter matches
+  /// this DualView's device type, else t_host.
+  ///
+  /// For example, suppose you create a DualView on Cuda, like this:
+  /// \code
+  /// typedef Kokkos::DualView<float, Kokkos::LayoutRight, Kokkos::Cuda> dual_view_type;
+  /// dual_view_type DV ("my dual view", 100);
+  /// \endcode
+  /// If you want to get the CUDA device View, do this:
+  /// \code
+  /// typename dual_view_type::t_dev cudaView = DV.view<Kokkos::Cuda> ();
+  /// \endcode
+  /// and if you want to get the host mirror of that View, do this:
+  /// \code
+  /// typedef typename Kokkos::HostSpace::execution_space host_device_type;
+  /// typename dual_view_type::t_host hostView = DV.view<host_device_type> ();
+  /// \endcode
+  template< class Device >
+  KOKKOS_INLINE_FUNCTION
+  const typename Impl::if_c<
+    Impl::is_same<typename t_dev::memory_space,
+                          typename Device::memory_space>::value,
+    t_dev,
+    t_host>::type& view () const
+  {
+    return Impl::if_c<
+      Impl::is_same<
+        typename t_dev::memory_space,
+        typename Device::memory_space>::value,
+      t_dev,
+      t_host >::select (d_view , h_view);
+  }
+
+  /// \brief Update data on device or host only if data in the other
+  ///   space has been marked as modified.
+  ///
+  /// If \c Device is the same as this DualView's device type, then
+  /// copy data from host to device.  Otherwise, copy data from device
+  /// to host.  In either case, only copy if the source of the copy
+  /// has been modified.
+  ///
+  /// This is a one-way synchronization only.  If the target of the
+  /// copy has been modified, this operation will discard those
+  /// modifications.  It will also reset both device and host modified
+  /// flags.
+  ///
+  /// \note This method doesn't know on its own whether you modified
+  ///   the data in either View.  You must manually mark modified data
+  ///   as modified, by calling the modify() method with the
+  ///   appropriate template parameter.
+  template<class Device>
+  void sync( const typename Impl::enable_if<
+        ( Impl::is_same< typename traits::data_type , typename traits::non_const_data_type>::value) ||
+        ( Impl::is_same< Device , int>::value)
+        , int >::type& = 0)
+  {
+    const unsigned int dev =
+      Impl::if_c<
+        Impl::is_same<
+          typename t_dev::memory_space,
+          typename Device::memory_space>::value ,
+        unsigned int,
+        unsigned int>::select (1, 0);
+
+    if (dev) { // if Device is the same as DualView's device type
+      if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
+        deep_copy (d_view, h_view);
+        modified_host() = modified_device() = 0;
+      }
+    } else { // hopefully Device is the same as DualView's host type
+      if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
+        deep_copy (h_view, d_view);
+        modified_host() = modified_device() = 0;
+      }
+    }
+    if(Impl::is_same<typename t_host::memory_space,typename t_dev::memory_space>::value) {
+      t_dev::execution_space::fence();
+      t_host::execution_space::fence();
+    }
+  }
+
+  template<class Device>
+  void sync ( const typename Impl::enable_if<
+      ( ! Impl::is_same< typename traits::data_type , typename traits::non_const_data_type>::value ) ||
+      ( Impl::is_same< Device , int>::value)
+      , int >::type& = 0 )
+  {
+    const unsigned int dev =
+      Impl::if_c<
+        Impl::is_same<
+          typename t_dev::memory_space,
+          typename Device::memory_space>::value,
+        unsigned int,
+        unsigned int>::select (1, 0);
+    if (dev) { // if Device is the same as DualView's device type
+      if ((modified_host () > 0) && (modified_host () >= modified_device ())) {
+        Impl::throw_runtime_exception("Calling sync on a DualView with a const datatype.");
+      }
+    } else { // hopefully Device is the same as DualView's host type
+      if ((modified_device () > 0) && (modified_device () >= modified_host ())) {
+        Impl::throw_runtime_exception("Calling sync on a DualView with a const datatype.");
+      }
+    }
+  }
+  /// \brief Mark data as modified on the given device \c Device.
+  ///
+  /// If \c Device is the same as this DualView's device type, then
+  /// mark the device's data as modified.  Otherwise, mark the host's
+  /// data as modified.
+  template<class Device>
+  void modify () {
+    const unsigned int dev =
+      Impl::if_c<
+        Impl::is_same<
+          typename t_dev::memory_space,
+          typename Device::memory_space>::value,
+        unsigned int,
+        unsigned int>::select (1, 0);
+
+    if (dev) { // if Device is the same as DualView's device type
+      // Increment the device's modified count.
+      modified_device () = (modified_device () > modified_host () ?
+                            modified_device () : modified_host ()) + 1;
+    } else { // hopefully Device is the same as DualView's host type
+      // Increment the host's modified count.
+      modified_host () = (modified_device () > modified_host () ?
+                          modified_device () : modified_host ())  + 1;
+    }
+  }
+
+  //@}
+  //! \name Methods for reallocating or resizing the View objects.
+  //@{
+
+  /// \brief Reallocate both View objects.
+  ///
+  /// This discards any existing contents of the objects, and resets
+  /// their modified flags.  It does <i>not</i> copy the old contents
+  /// of either View into the new View objects.
+  void realloc( const size_t n0 = 0 ,
+           const size_t n1 = 0 ,
+           const size_t n2 = 0 ,
+           const size_t n3 = 0 ,
+           const size_t n4 = 0 ,
+           const size_t n5 = 0 ,
+           const size_t n6 = 0 ,
+           const size_t n7 = 0 ) {
+    ::Kokkos::realloc(d_view,n0,n1,n2,n3,n4,n5,n6,n7);
+     h_view = create_mirror_view( d_view );
+
+     /* Reset dirty flags */
+     modified_device() = modified_host() = 0;
+  }
+
+  /// \brief Resize both views, copying old contents into new if necessary.
+  ///
+  /// This method only copies the old contents into the new View
+  /// objects for the device which was last marked as modified.
+  void resize( const size_t n0 = 0 ,
+           const size_t n1 = 0 ,
+           const size_t n2 = 0 ,
+           const size_t n3 = 0 ,
+           const size_t n4 = 0 ,
+           const size_t n5 = 0 ,
+           const size_t n6 = 0 ,
+           const size_t n7 = 0 ) {
+   if(modified_device() >= modified_host()) {
+     /* Resize on Device */
+     ::Kokkos::resize(d_view,n0,n1,n2,n3,n4,n5,n6,n7);
+     h_view = create_mirror_view( d_view );
+
+     /* Mark Device copy as modified */
+     modified_device() = modified_device()+1;
+
+   } else {
+     /* Realloc on Device */
+
+     ::Kokkos::realloc(d_view,n0,n1,n2,n3,n4,n5,n6,n7);
+     t_host temp_view = create_mirror_view( d_view );
+
+     /* Remap on Host */
+     Kokkos::deep_copy( temp_view , h_view );
+
+     h_view = temp_view;
+
+     /* Mark Host copy as modified */
+     modified_host() = modified_host()+1;
+   }
+  }
+
+  //@}
+  //! \name Methods for getting capacity, stride, or dimension(s).
+  //@{
+
+  //! The allocation size (same as Kokkos::View::capacity).
+  size_t capacity() const {
+#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
+    return d_view.span();
+#else
+    return d_view.capacity();
+#endif
+  }
+
+  //! Get stride(s) for each dimension.
+  template< typename iType>
+  void stride(iType* stride_) const {
+    d_view.stride(stride_);
+  }
+
+  /* \brief return size of dimension 0 */
+  size_t dimension_0() const {return d_view.dimension_0();}
+  /* \brief return size of dimension 1 */
+  size_t dimension_1() const {return d_view.dimension_1();}
+  /* \brief return size of dimension 2 */
+  size_t dimension_2() const {return d_view.dimension_2();}
+  /* \brief return size of dimension 3 */
+  size_t dimension_3() const {return d_view.dimension_3();}
+  /* \brief return size of dimension 4 */
+  size_t dimension_4() const {return d_view.dimension_4();}
+  /* \brief return size of dimension 5 */
+  size_t dimension_5() const {return d_view.dimension_5();}
+  /* \brief return size of dimension 6 */
+  size_t dimension_6() const {return d_view.dimension_6();}
+  /* \brief return size of dimension 7 */
+  size_t dimension_7() const {return d_view.dimension_7();}
+
+  //@}
+};
+
+} // namespace Kokkos
+//
+// Partial specializations of Kokkos::subview() for DualView objects.
+//
+
+namespace Kokkos {
+namespace Impl {
+
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+        , class SubArg4_type , class SubArg5_type , class SubArg6_type , class SubArg7_type
+        >
+struct ViewSubview< DualView< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type  >
+                  , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type
+                  , SubArg4_type , SubArg5_type , SubArg6_type , SubArg7_type >
+{
+private:
+
+  typedef DualView< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type >  SrcViewType ;
+
+  enum { V0 = Impl::is_same< SubArg0_type , void >::value ? 1 : 0 };
+  enum { V1 = Impl::is_same< SubArg1_type , void >::value ? 1 : 0 };
+  enum { V2 = Impl::is_same< SubArg2_type , void >::value ? 1 : 0 };
+  enum { V3 = Impl::is_same< SubArg3_type , void >::value ? 1 : 0 };
+  enum { V4 = Impl::is_same< SubArg4_type , void >::value ? 1 : 0 };
+  enum { V5 = Impl::is_same< SubArg5_type , void >::value ? 1 : 0 };
+  enum { V6 = Impl::is_same< SubArg6_type , void >::value ? 1 : 0 };
+  enum { V7 = Impl::is_same< SubArg7_type , void >::value ? 1 : 0 };
+
+  // The source view rank must be equal to the input argument rank
+  // Once a void argument is encountered all subsequent arguments must be void.
+  enum { InputRank =
+    Impl::StaticAssert<( SrcViewType::rank ==
+                         ( V0 ? 0 : (
+                           V1 ? 1 : (
+                           V2 ? 2 : (
+                           V3 ? 3 : (
+                           V4 ? 4 : (
+                           V5 ? 5 : (
+                           V6 ? 6 : (
+                           V7 ? 7 : 8 ))))))) ))
+                       &&
+                       ( SrcViewType::rank ==
+                         ( 8 - ( V0 + V1 + V2 + V3 + V4 + V5 + V6 + V7 ) ) )
+    >::value ? SrcViewType::rank : 0 };
+
+  enum { R0 = Impl::ViewOffsetRange< SubArg0_type >::is_range ? 1 : 0 };
+  enum { R1 = Impl::ViewOffsetRange< SubArg1_type >::is_range ? 1 : 0 };
+  enum { R2 = Impl::ViewOffsetRange< SubArg2_type >::is_range ? 1 : 0 };
+  enum { R3 = Impl::ViewOffsetRange< SubArg3_type >::is_range ? 1 : 0 };
+  enum { R4 = Impl::ViewOffsetRange< SubArg4_type >::is_range ? 1 : 0 };
+  enum { R5 = Impl::ViewOffsetRange< SubArg5_type >::is_range ? 1 : 0 };
+  enum { R6 = Impl::ViewOffsetRange< SubArg6_type >::is_range ? 1 : 0 };
+  enum { R7 = Impl::ViewOffsetRange< SubArg7_type >::is_range ? 1 : 0 };
+
+  enum { OutputRank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3)
+                    + unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) };
+
+  // Reverse
+  enum { R0_rev = 0 == InputRank ? 0u : (
+                  1 == InputRank ? unsigned(R0) : (
+                  2 == InputRank ? unsigned(R1) : (
+                  3 == InputRank ? unsigned(R2) : (
+                  4 == InputRank ? unsigned(R3) : (
+                  5 == InputRank ? unsigned(R4) : (
+                  6 == InputRank ? unsigned(R5) : (
+                  7 == InputRank ? unsigned(R6) : unsigned(R7) ))))))) };
+
+  typedef typename SrcViewType::array_layout  SrcViewLayout ;
+
+  // Choose array layout, attempting to preserve original layout if at all possible.
+  typedef typename Impl::if_c<
+     ( // Same Layout IF
+       // OutputRank 0
+       ( OutputRank == 0 )
+       ||
+       // OutputRank 1 or 2, InputLayout Left, Interval 0
+       // because single stride one or second index has a stride.
+       ( OutputRank <= 2 && R0 && Impl::is_same<SrcViewLayout,LayoutLeft>::value )
+       ||
+       // OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1]
+       // because single stride one or second index has a stride.
+       ( OutputRank <= 2 && R0_rev && Impl::is_same<SrcViewLayout,LayoutRight>::value )
+     ), SrcViewLayout , Kokkos::LayoutStride >::type OutputViewLayout ;
+
+  // Choose data type as a purely dynamic rank array to accomodate a runtime range.
+  typedef typename Impl::if_c< OutputRank == 0 , typename SrcViewType::value_type ,
+          typename Impl::if_c< OutputRank == 1 , typename SrcViewType::value_type *,
+          typename Impl::if_c< OutputRank == 2 , typename SrcViewType::value_type **,
+          typename Impl::if_c< OutputRank == 3 , typename SrcViewType::value_type ***,
+          typename Impl::if_c< OutputRank == 4 , typename SrcViewType::value_type ****,
+          typename Impl::if_c< OutputRank == 5 , typename SrcViewType::value_type *****,
+          typename Impl::if_c< OutputRank == 6 , typename SrcViewType::value_type ******,
+          typename Impl::if_c< OutputRank == 7 , typename SrcViewType::value_type *******,
+                                                 typename SrcViewType::value_type ********
+  >::type >::type >::type >::type >::type >::type >::type >::type  OutputData ;
+
+  // Choose space.
+  // If the source view's template arg1 or arg2 is a space then use it,
+  // otherwise use the source view's execution space.
+
+  typedef typename Impl::if_c< Impl::is_space< SrcArg1Type >::value , SrcArg1Type ,
+          typename Impl::if_c< Impl::is_space< SrcArg2Type >::value , SrcArg2Type , typename SrcViewType::execution_space
+  >::type >::type OutputSpace ;
+
+public:
+
+  // If keeping the layout then match non-data type arguments
+  // else keep execution space and memory traits.
+  typedef typename
+    Impl::if_c< Impl::is_same< SrcViewLayout , OutputViewLayout >::value
+              , Kokkos::DualView< OutputData , SrcArg1Type , SrcArg2Type , SrcArg3Type >
+              , Kokkos::DualView< OutputData , OutputViewLayout , OutputSpace
+                            , typename SrcViewType::memory_traits >
+              >::type  type ;
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+namespace Kokkos {
+
+template< class D , class A1 , class A2 , class A3 ,
+          class ArgType0 >
+typename Impl::ViewSubview< DualView<D,A1,A2,A3>
+                          , ArgType0 , void , void , void
+                          , void , void , void , void
+                          >::type
+subview( const DualView<D,A1,A2,A3> & src ,
+         const ArgType0 & arg0 )
+{
+  typedef typename
+    Impl::ViewSubview< DualView<D,A1,A2,A3>
+                 , ArgType0 , void , void , void
+                 , void , void , void , void
+                 >::type
+      DstViewType ;
+  DstViewType sub_view;
+  sub_view.d_view = subview(src.d_view,arg0);
+  sub_view.h_view = subview(src.h_view,arg0);
+  sub_view.modified_device = src.modified_device;
+  sub_view.modified_host = src.modified_host;
+  return sub_view;
+}
+
+
+template< class D , class A1 , class A2 , class A3 ,
+          class ArgType0 , class ArgType1 >
+typename Impl::ViewSubview< DualView<D,A1,A2,A3>
+                          , ArgType0 , ArgType1 , void , void
+                          , void , void , void , void
+                          >::type
+subview( const DualView<D,A1,A2,A3> & src ,
+         const ArgType0 & arg0 ,
+         const ArgType1 & arg1 )
+{
+  typedef typename
+    Impl::ViewSubview< DualView<D,A1,A2,A3>
+                 , ArgType0 , ArgType1 , void , void
+                 , void , void , void , void
+                 >::type
+      DstViewType ;
+  DstViewType sub_view;
+  sub_view.d_view = subview(src.d_view,arg0,arg1);
+  sub_view.h_view = subview(src.h_view,arg0,arg1);
+  sub_view.modified_device = src.modified_device;
+  sub_view.modified_host = src.modified_host;
+  return sub_view;
+}
+
+template< class D , class A1 , class A2 , class A3 ,
+          class ArgType0 , class ArgType1 , class ArgType2 >
+typename Impl::ViewSubview< DualView<D,A1,A2,A3>
+                          , ArgType0 , ArgType1 , ArgType2 , void
+                          , void , void , void , void
+                          >::type
+subview( const DualView<D,A1,A2,A3> & src ,
+         const ArgType0 & arg0 ,
+         const ArgType1 & arg1 ,
+         const ArgType2 & arg2 )
+{
+  typedef typename
+    Impl::ViewSubview< DualView<D,A1,A2,A3>
+                 , ArgType0 , ArgType1 , ArgType2 , void
+                 , void , void , void , void
+                 >::type
+      DstViewType ;
+  DstViewType sub_view;
+  sub_view.d_view = subview(src.d_view,arg0,arg1,arg2);
+  sub_view.h_view = subview(src.h_view,arg0,arg1,arg2);
+  sub_view.modified_device = src.modified_device;
+  sub_view.modified_host = src.modified_host;
+  return sub_view;
+}
+
+template< class D , class A1 , class A2 , class A3 ,
+          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 >
+typename Impl::ViewSubview< DualView<D,A1,A2,A3>
+                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                          , void , void , void , void
+                          >::type
+subview( const DualView<D,A1,A2,A3> & src ,
+         const ArgType0 & arg0 ,
+         const ArgType1 & arg1 ,
+         const ArgType2 & arg2 ,
+         const ArgType3 & arg3 )
+{
+  typedef typename
+    Impl::ViewSubview< DualView<D,A1,A2,A3>
+                 , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                 , void , void , void , void
+                 >::type
+      DstViewType ;
+  DstViewType sub_view;
+  sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3);
+  sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3);
+  sub_view.modified_device = src.modified_device;
+  sub_view.modified_host = src.modified_host;
+  return sub_view;
+}
+
+template< class D , class A1 , class A2 , class A3 ,
+          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
+          class ArgType4 >
+typename Impl::ViewSubview< DualView<D,A1,A2,A3>
+                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                          , ArgType4 , void , void , void
+                          >::type
+subview( const DualView<D,A1,A2,A3> & src ,
+         const ArgType0 & arg0 ,
+         const ArgType1 & arg1 ,
+         const ArgType2 & arg2 ,
+         const ArgType3 & arg3 ,
+         const ArgType4 & arg4 )
+{
+  typedef typename
+    Impl::ViewSubview< DualView<D,A1,A2,A3>
+                 , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                 , ArgType4 , void , void ,void
+                 >::type
+      DstViewType ;
+  DstViewType sub_view;
+  sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4);
+  sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4);
+  sub_view.modified_device = src.modified_device;
+  sub_view.modified_host = src.modified_host;
+  return sub_view;
+}
+
+template< class D , class A1 , class A2 , class A3 ,
+          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
+          class ArgType4 , class ArgType5 >
+typename Impl::ViewSubview< DualView<D,A1,A2,A3>
+                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                          , ArgType4 , ArgType5 , void , void
+                          >::type
+subview( const DualView<D,A1,A2,A3> & src ,
+         const ArgType0 & arg0 ,
+         const ArgType1 & arg1 ,
+         const ArgType2 & arg2 ,
+         const ArgType3 & arg3 ,
+         const ArgType4 & arg4 ,
+         const ArgType5 & arg5 )
+{
+  typedef typename
+    Impl::ViewSubview< DualView<D,A1,A2,A3>
+                 , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                 , ArgType4 , ArgType5 , void , void
+                 >::type
+      DstViewType ;
+  DstViewType sub_view;
+  sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5);
+  sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5);
+  sub_view.modified_device = src.modified_device;
+  sub_view.modified_host = src.modified_host;
+  return sub_view;
+}
+
+template< class D , class A1 , class A2 , class A3 ,
+          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
+          class ArgType4 , class ArgType5 , class ArgType6 >
+typename Impl::ViewSubview< DualView<D,A1,A2,A3>
+                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                          , ArgType4 , ArgType5 , ArgType6 , void
+                          >::type
+subview( const DualView<D,A1,A2,A3> & src ,
+         const ArgType0 & arg0 ,
+         const ArgType1 & arg1 ,
+         const ArgType2 & arg2 ,
+         const ArgType3 & arg3 ,
+         const ArgType4 & arg4 ,
+         const ArgType5 & arg5 ,
+         const ArgType6 & arg6 )
+{
+  typedef typename
+    Impl::ViewSubview< DualView<D,A1,A2,A3>
+                 , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                 , ArgType4 , ArgType5 , ArgType6 , void
+                 >::type
+      DstViewType ;
+  DstViewType sub_view;
+  sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6);
+  sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6);
+  sub_view.modified_device = src.modified_device;
+  sub_view.modified_host = src.modified_host;
+  return sub_view;
+}
+
+template< class D , class A1 , class A2 , class A3 ,
+          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
+          class ArgType4 , class ArgType5 , class ArgType6 , class ArgType7 >
+typename Impl::ViewSubview< DualView<D,A1,A2,A3>
+                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                          , ArgType4 , ArgType5 , ArgType6 , ArgType7
+                          >::type
+subview( const DualView<D,A1,A2,A3> & src ,
+         const ArgType0 & arg0 ,
+         const ArgType1 & arg1 ,
+         const ArgType2 & arg2 ,
+         const ArgType3 & arg3 ,
+         const ArgType4 & arg4 ,
+         const ArgType5 & arg5 ,
+         const ArgType6 & arg6 ,
+         const ArgType7 & arg7 )
+{
+  typedef typename
+    Impl::ViewSubview< DualView<D,A1,A2,A3>
+                 , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                 , ArgType4 , ArgType5 , ArgType6 , ArgType7
+                 >::type
+      DstViewType ;
+  DstViewType sub_view;
+  sub_view.d_view = subview(src.d_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7);
+  sub_view.h_view = subview(src.h_view,arg0,arg1,arg2,arg3,arg4,arg5,arg6,arg7);
+  sub_view.modified_device = src.modified_device;
+  sub_view.modified_host = src.modified_host;
+  return sub_view;
+}
+
+//
+// Partial specialization of Kokkos::deep_copy() for DualView objects.
+//
+
+template< class DT , class DL , class DD , class DM ,
+          class ST , class SL , class SD , class SM >
+void
+deep_copy (DualView<DT,DL,DD,DM> dst, // trust me, this must not be a reference
+           const DualView<ST,SL,SD,SM>& src )
+{
+  if (src.modified_device () >= src.modified_host ()) {
+    deep_copy (dst.d_view, src.d_view);
+    dst.template modify<typename DualView<DT,DL,DD,DM>::device_type> ();
+  } else {
+    deep_copy (dst.h_view, src.h_view);
+    dst.template modify<typename DualView<DT,DL,DD,DM>::host_mirror_space> ();
+  }
+}
+
+} // namespace Kokkos
+
+#endif
diff --git a/lib/kokkos/containers/src/Kokkos_Functional.hpp b/lib/kokkos/containers/src/Kokkos_Functional.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..5c7350ef1cd3bb1ed68deff0c823ce3f7a5a3619
--- /dev/null
+++ b/lib/kokkos/containers/src/Kokkos_Functional.hpp
@@ -0,0 +1,173 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_FUNCTIONAL_HPP
+#define KOKKOS_FUNCTIONAL_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_Functional_impl.hpp>
+
+namespace Kokkos {
+
+// These should work for most types
+
+template <typename T>
+struct pod_hash
+{
+  typedef T argument_type;
+  typedef T first_argument_type;
+  typedef uint32_t second_argument_type;
+  typedef uint32_t result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  uint32_t operator()(T const & t) const
+  { return Impl::MurmurHash3_x86_32( &t, sizeof(T), 0); }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  uint32_t operator()(T const & t, uint32_t seed) const
+  { return Impl::MurmurHash3_x86_32( &t, sizeof(T), seed); }
+};
+
+template <typename T>
+struct pod_equal_to
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return Impl::bitwise_equal(&a,&b); }
+};
+
+template <typename T>
+struct pod_not_equal_to
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return !Impl::bitwise_equal(&a,&b); }
+};
+
+template <typename T>
+struct equal_to
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a == b; }
+};
+
+template <typename T>
+struct not_equal_to
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a != b; }
+};
+
+
+template <typename T>
+struct greater
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a > b; }
+};
+
+
+template <typename T>
+struct less
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a < b; }
+};
+
+template <typename T>
+struct greater_equal
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a >= b; }
+};
+
+
+template <typename T>
+struct less_equal
+{
+  typedef T first_argument_type;
+  typedef T second_argument_type;
+  typedef bool result_type;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool operator()(T const & a, T const & b) const
+  { return a <= b; }
+};
+
+} // namespace Kokkos
+
+
+#endif //KOKKOS_FUNCTIONAL_HPP
+
+
diff --git a/lib/kokkos/containers/src/Kokkos_SegmentedView.hpp b/lib/kokkos/containers/src/Kokkos_SegmentedView.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..3f328ba9563f01421c93dda8e8eeafbc2d679968
--- /dev/null
+++ b/lib/kokkos/containers/src/Kokkos_SegmentedView.hpp
@@ -0,0 +1,531 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SEGMENTED_VIEW_HPP_
+#define KOKKOS_SEGMENTED_VIEW_HPP_
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <cstdio>
+
+#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
+
+namespace Kokkos {
+namespace Experimental {
+
+namespace Impl {
+
+template<class DataType, class Arg1Type, class Arg2Type, class Arg3Type>
+struct delete_segmented_view;
+
+template<class MemorySpace>
+inline
+void DeviceSetAllocatableMemorySize(size_t) {}
+
+#if defined( KOKKOS_HAVE_CUDA )
+
+template<>
+inline
+void DeviceSetAllocatableMemorySize<Kokkos::CudaSpace>(size_t size) {
+#ifdef __CUDACC__
+  size_t size_limit;
+  cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
+  if(size_limit<size)
+    cudaDeviceSetLimit(cudaLimitMallocHeapSize,2*size);
+  cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
+#endif
+}
+
+template<>
+inline
+void DeviceSetAllocatableMemorySize<Kokkos::CudaUVMSpace>(size_t size) {
+#ifdef __CUDACC__
+  size_t size_limit;
+  cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
+  if(size_limit<size)
+    cudaDeviceSetLimit(cudaLimitMallocHeapSize,2*size);
+  cudaDeviceGetLimit(&size_limit,cudaLimitMallocHeapSize);
+#endif
+}
+
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
+
+}
+
+template< class DataType ,
+          class Arg1Type = void ,
+          class Arg2Type = void ,
+          class Arg3Type = void>
+class SegmentedView : public Kokkos::ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type >
+{
+public:
+  //! \name Typedefs for device types and various Kokkos::View specializations.
+  //@{
+  typedef Kokkos::ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ;
+
+  //! The type of a Kokkos::View on the device.
+  typedef Kokkos::View< typename traits::data_type ,
+                typename traits::array_layout ,
+                typename traits::memory_space ,
+                Kokkos::MemoryUnmanaged > t_dev ;
+
+
+private:
+  Kokkos::View<t_dev*,typename traits::memory_space> segments_;
+
+  Kokkos::View<int,typename traits::memory_space> realloc_lock;
+  Kokkos::View<int,typename traits::memory_space> nsegments_;
+
+  size_t segment_length_;
+  size_t segment_length_m1_;
+  int max_segments_;
+
+  int segment_length_log2;
+
+  // Dimensions, cardinality, capacity, and offset computation for
+  // multidimensional array view of contiguous memory.
+  // Inherits from Impl::Shape
+  typedef Kokkos::Impl::ViewOffset< typename traits::shape_type
+                          , typename traits::array_layout
+                          > offset_map_type ;
+
+  offset_map_type               m_offset_map ;
+
+  typedef Kokkos::View< typename traits::array_intrinsic_type ,
+                typename traits::array_layout ,
+                typename traits::memory_space ,
+                typename traits::memory_traits > array_type ;
+
+  typedef Kokkos::View< typename traits::const_data_type ,
+                typename traits::array_layout ,
+                typename traits::memory_space ,
+                typename traits::memory_traits > const_type ;
+
+  typedef Kokkos::View< typename traits::non_const_data_type ,
+                typename traits::array_layout ,
+                typename traits::memory_space ,
+                typename traits::memory_traits > non_const_type ;
+
+  typedef Kokkos::View< typename traits::non_const_data_type ,
+                typename traits::array_layout ,
+                HostSpace ,
+                void > HostMirror ;
+
+  template< bool Accessible >
+  KOKKOS_INLINE_FUNCTION
+  typename Kokkos::Impl::enable_if< Accessible , typename traits::size_type >::type
+  dimension_0_intern() const { return nsegments_() * segment_length_ ; }
+
+  template< bool Accessible >
+  KOKKOS_INLINE_FUNCTION
+  typename Kokkos::Impl::enable_if< ! Accessible , typename traits::size_type >::type
+  dimension_0_intern() const
+  {
+    // In Host space
+    int n = 0 ;
+#if ! defined( __CUDA_ARCH__ )
+    Kokkos::Impl::DeepCopy< HostSpace , typename traits::memory_space >( & n , nsegments_.ptr_on_device() , sizeof(int) );
+#endif
+
+    return n * segment_length_ ;
+  }
+
+public:
+
+  enum { Rank = traits::rank };
+
+  KOKKOS_INLINE_FUNCTION offset_map_type shape() const { return m_offset_map ; }
+
+  /* \brief return (current) size of dimension 0 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const {
+    enum { Accessible = Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
+      Kokkos::Impl::ActiveExecutionMemorySpace, typename traits::memory_space >::value };
+    int n = SegmentedView::dimension_0_intern< Accessible >();
+    return n ;
+  }
+
+  /* \brief return size of dimension 1 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_offset_map.N1 ; }
+  /* \brief return size of dimension 2 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_offset_map.N2 ; }
+  /* \brief return size of dimension 3 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_offset_map.N3 ; }
+  /* \brief return size of dimension 4 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_offset_map.N4 ; }
+  /* \brief return size of dimension 5 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_offset_map.N5 ; }
+  /* \brief return size of dimension 6 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_offset_map.N6 ; }
+  /* \brief return size of dimension 7 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_offset_map.N7 ; }
+
+  /* \brief return size of dimension 2 */
+  KOKKOS_INLINE_FUNCTION typename traits::size_type size() const {
+    return dimension_0() *
+        m_offset_map.N1 * m_offset_map.N2 * m_offset_map.N3 * m_offset_map.N4 *
+        m_offset_map.N5 * m_offset_map.N6 * m_offset_map.N7 ;
+  }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  typename traits::size_type dimension( const iType & i ) const {
+    if(i==0)
+      return dimension_0();
+    else
+      return Kokkos::Impl::dimension( m_offset_map , i );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  typename traits::size_type capacity() {
+    return segments_.dimension_0() *
+        m_offset_map.N1 * m_offset_map.N2 * m_offset_map.N3 * m_offset_map.N4 *
+        m_offset_map.N5 * m_offset_map.N6 * m_offset_map.N7;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  typename traits::size_type get_num_segments() {
+    enum { Accessible = Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
+      Kokkos::Impl::ActiveExecutionMemorySpace, typename traits::memory_space >::value };
+    int n = SegmentedView::dimension_0_intern< Accessible >();
+    return n/segment_length_ ;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  typename traits::size_type get_max_segments() {
+    return max_segments_;
+  }
+
+  /// \brief Constructor that allocates View objects with an initial length of 0.
+  ///
+  /// This constructor works mostly like the analogous constructor of View.
+  /// The first argument is a string label, which is entirely for your
+  /// benefit.  (Different SegmentedView objects may have the same label if
+  /// you like.)  The second argument 'view_length' is the size of the segments.
+  /// This number must be a power of two. The third argument n0 is the maximum
+  /// value for the first dimension of the segmented view. The maximal allocatable
+  /// number of Segments is thus: (n0+view_length-1)/view_length.
+  /// The arguments that follow are the other dimensions of the (1-7) of the
+  /// View objects.  For example, for a View with 3 runtime dimensions,
+  /// the first 4 integer arguments will be nonzero:
+  /// SegmentedView("Name",32768,10000000,8,4). This allocates a SegmentedView
+  /// with a maximum of 306 segments of dimension (32768,8,4). The logical size of
+  /// the segmented view is (n,8,4) with n between 0 and 10000000.
+  /// You may omit the integer arguments that follow.
+  template< class LabelType >
+  SegmentedView(const LabelType & label ,
+      const size_t view_length ,
+      const size_t n0 ,
+      const size_t n1 = 0 ,
+      const size_t n2 = 0 ,
+      const size_t n3 = 0 ,
+      const size_t n4 = 0 ,
+      const size_t n5 = 0 ,
+      const size_t n6 = 0 ,
+      const size_t n7 = 0
+      ): segment_length_(view_length),segment_length_m1_(view_length-1)
+  {
+    segment_length_log2 = -1;
+    size_t l = segment_length_;
+    while(l>0) {
+      l>>=1;
+      segment_length_log2++;
+    }
+    l = 1<<segment_length_log2;
+    if(l!=segment_length_)
+      Kokkos::Impl::throw_runtime_exception("Kokkos::SegmentedView requires a 'power of 2' segment length");
+
+    max_segments_ = (n0+segment_length_m1_)/segment_length_;
+
+    Impl::DeviceSetAllocatableMemorySize<typename traits::memory_space>(segment_length_*max_segments_*sizeof(typename traits::value_type));
+
+    segments_ = Kokkos::View<t_dev*,typename traits::execution_space>(label , max_segments_);
+    realloc_lock = Kokkos::View<int,typename traits::execution_space>("Lock");
+    nsegments_ = Kokkos::View<int,typename traits::execution_space>("nviews");
+    m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n0*n1*n2*n3*n4*n5*n6*n7 );
+
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SegmentedView(const SegmentedView& src):
+    segments_(src.segments_),
+    realloc_lock (src.realloc_lock),
+    nsegments_ (src.nsegments_),
+    segment_length_(src.segment_length_),
+    segment_length_m1_(src.segment_length_m1_),
+    max_segments_ (src.max_segments_),
+    segment_length_log2(src.segment_length_log2),
+    m_offset_map (src.m_offset_map)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  SegmentedView& operator= (const SegmentedView& src) {
+    segments_ = src.segments_;
+    realloc_lock = src.realloc_lock;
+    nsegments_ = src.nsegments_;
+    segment_length_= src.segment_length_;
+    segment_length_m1_= src.segment_length_m1_;
+    max_segments_ = src.max_segments_;
+    segment_length_log2= src.segment_length_log2;
+    m_offset_map = src.m_offset_map;
+    return *this;
+  }
+
+  ~SegmentedView() {
+    if ( !segments_.tracker().ref_counting()) { return; }
+    size_t ref_count = segments_.tracker().ref_count();
+    if(ref_count == 1u) {
+      Kokkos::fence();
+      typename Kokkos::View<int,typename traits::execution_space>::HostMirror h_nviews("h_nviews");
+      Kokkos::deep_copy(h_nviews,nsegments_);
+      Kokkos::parallel_for(h_nviews(),Impl::delete_segmented_view<DataType , Arg1Type , Arg2Type, Arg3Type>(*this));
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  t_dev get_segment(const int& i) const {
+    return segments_[i];
+  }
+
+  template< class MemberType>
+  KOKKOS_INLINE_FUNCTION
+  void grow (MemberType& team_member, const size_t& growSize) const {
+    if (growSize>max_segments_*segment_length_) {
+      printf ("Exceeding maxSize: %lu %lu\n", growSize, max_segments_*segment_length_);
+      return;
+    }
+
+    if(team_member.team_rank()==0) {
+      bool too_small = growSize > segment_length_ * nsegments_();
+      if (too_small) {
+        while(Kokkos::atomic_compare_exchange(&realloc_lock(),0,1) )
+          ; // get the lock
+        too_small = growSize > segment_length_ * nsegments_(); // Recheck once we have the lock
+        if(too_small) {
+          while(too_small) {
+            const size_t alloc_size = segment_length_*m_offset_map.N1*m_offset_map.N2*m_offset_map.N3*
+                m_offset_map.N4*m_offset_map.N5*m_offset_map.N6*m_offset_map.N7;
+            typename traits::non_const_value_type* const ptr = new typename traits::non_const_value_type[alloc_size];
+
+            segments_(nsegments_()) =
+                t_dev(ptr,segment_length_,m_offset_map.N1,m_offset_map.N2,m_offset_map.N3,m_offset_map.N4,m_offset_map.N5,m_offset_map.N6,m_offset_map.N7);
+            nsegments_()++;
+            too_small = growSize > segment_length_ * nsegments_();
+          }
+        }
+        realloc_lock() = 0; //release the lock
+      }
+    }
+    team_member.team_barrier();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void grow_non_thread_safe (const size_t& growSize) const {
+    if (growSize>max_segments_*segment_length_) {
+      printf ("Exceeding maxSize: %lu %lu\n", growSize, max_segments_*segment_length_);
+      return;
+    }
+    bool too_small = growSize > segment_length_ * nsegments_();
+    if(too_small) {
+      while(too_small) {
+        const size_t alloc_size = segment_length_*m_offset_map.N1*m_offset_map.N2*m_offset_map.N3*
+                            m_offset_map.N4*m_offset_map.N5*m_offset_map.N6*m_offset_map.N7;
+        typename traits::non_const_value_type* const ptr =
+          new typename traits::non_const_value_type[alloc_size];
+
+        segments_(nsegments_()) =
+          t_dev (ptr, segment_length_, m_offset_map.N1, m_offset_map.N2,
+                 m_offset_map.N3, m_offset_map.N4, m_offset_map.N5,
+                 m_offset_map.N6, m_offset_map.N7);
+        nsegments_()++;
+        too_small = growSize > segment_length_ * nsegments_();
+      }
+    }
+  }
+
+  template< typename iType0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( std::is_integral<iType0>::value && traits::rank == 1 )
+                         , typename traits::value_type &
+                         >::type
+    operator() ( const iType0 & i0 ) const
+    {
+      return segments_[i0>>segment_length_log2](i0&(segment_length_m1_));
+    }
+
+  template< typename iType0 , typename iType1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( std::is_integral<iType0>::value &&
+                            std::is_integral<iType1>::value &&
+                            traits::rank == 2 )
+                         , typename traits::value_type &
+                         >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 ) const
+    {
+      return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( std::is_integral<iType0>::value &&
+                            std::is_integral<iType1>::value &&
+                            std::is_integral<iType2>::value &&
+                            traits::rank == 3 )
+                         , typename traits::value_type &
+                         >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
+    {
+      return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( std::is_integral<iType0>::value &&
+                            std::is_integral<iType1>::value &&
+                            std::is_integral<iType2>::value &&
+                            std::is_integral<iType3>::value &&
+                            traits::rank == 4 )
+                         , typename traits::value_type &
+                         >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
+    {
+      return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
+            typename iType4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( std::is_integral<iType0>::value &&
+                            std::is_integral<iType1>::value &&
+                            std::is_integral<iType2>::value &&
+                            std::is_integral<iType3>::value &&
+                            std::is_integral<iType4>::value &&
+                            traits::rank == 5 )
+                         , typename traits::value_type &
+                         >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
+                 const iType4 & i4 ) const
+    {
+      return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
+            typename iType4 , typename iType5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( std::is_integral<iType0>::value &&
+                            std::is_integral<iType1>::value &&
+                            std::is_integral<iType2>::value &&
+                            std::is_integral<iType3>::value &&
+                            std::is_integral<iType4>::value &&
+                            std::is_integral<iType5>::value &&
+                            traits::rank == 6 )
+                         , typename traits::value_type &
+                         >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
+                 const iType4 & i4 , const iType5 & i5 ) const
+    {
+      return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
+            typename iType4 , typename iType5 , typename iType6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( std::is_integral<iType0>::value &&
+                            std::is_integral<iType1>::value &&
+                            std::is_integral<iType2>::value &&
+                            std::is_integral<iType3>::value &&
+                            std::is_integral<iType4>::value &&
+                            std::is_integral<iType5>::value &&
+                            std::is_integral<iType6>::value &&
+                            traits::rank == 7 )
+                         , typename traits::value_type &
+                         >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
+                 const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const
+    {
+      return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5,i6);
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
+            typename iType4 , typename iType5 , typename iType6 , typename iType7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( std::is_integral<iType0>::value &&
+                            std::is_integral<iType1>::value &&
+                            std::is_integral<iType2>::value &&
+                            std::is_integral<iType3>::value &&
+                            std::is_integral<iType4>::value &&
+                            std::is_integral<iType5>::value &&
+                            std::is_integral<iType6>::value &&
+                            std::is_integral<iType7>::value &&
+                            traits::rank == 8 )
+                         , typename traits::value_type &
+                         >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
+                 const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const
+    {
+      return segments_[i0>>segment_length_log2](i0&(segment_length_m1_),i1,i2,i3,i4,i5,i6,i7);
+    }
+};
+
+namespace Impl {
+template<class DataType, class Arg1Type, class Arg2Type, class Arg3Type>
+struct delete_segmented_view {
+  typedef SegmentedView<DataType , Arg1Type , Arg2Type, Arg3Type> view_type;
+  typedef typename view_type::execution_space execution_space;
+
+  view_type view_;
+  delete_segmented_view(view_type view):view_(view) {
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i) const {
+    delete [] view_.get_segment(i).ptr_on_device();
+  }
+};
+
+}
+}
+}
+
+#endif
+
+#endif
diff --git a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..1ce38638a2b6a107d1439f7feebb0c90c4a8068f
--- /dev/null
+++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp
@@ -0,0 +1,226 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STATICCRSGRAPH_HPP
+#define KOKKOS_STATICCRSGRAPH_HPP
+
+#include <string>
+#include <vector>
+
+#include <Kokkos_Core.hpp>
+
+namespace Kokkos {
+
+/// \class StaticCrsGraph
+/// \brief Compressed row storage array.
+///
+/// \tparam DataType The type of stored entries.  If a StaticCrsGraph is
+///   used as the graph of a sparse matrix, then this is usually an
+///   integer type, the type of the column indices in the sparse
+///   matrix.
+///
+/// \tparam Arg1Type The second template parameter, corresponding
+///   either to the Device type (if there are no more template
+///   parameters) or to the Layout type (if there is at least one more
+///   template parameter).
+///
+/// \tparam Arg2Type The third template parameter, which if provided
+///   corresponds to the Device type.
+///
+/// \tparam SizeType The type of row offsets.  Usually the default
+///   parameter suffices.  However, setting a nondefault value is
+///   necessary in some cases, for example, if you want to have a
+///   sparse matrices with dimensions (and therefore column indices)
+///   that fit in \c int, but want to store more than <tt>INT_MAX</tt>
+///   entries in the sparse matrix.
+///
+/// A row has a range of entries:
+/// <ul>
+/// <li> <tt> row_map[i0] <= entry < row_map[i0+1] </tt> </li>
+/// <li> <tt> 0 <= i1 < row_map[i0+1] - row_map[i0] </tt> </li>
+/// <li> <tt> entries( entry ,            i2 , i3 , ... ); </tt> </li>
+/// <li> <tt> entries( row_map[i0] + i1 , i2 , i3 , ... ); </tt> </li>
+/// </ul>
+template< class DataType,
+          class Arg1Type,
+          class Arg2Type = void,
+          typename SizeType = typename ViewTraits<DataType*, Arg1Type, Arg2Type, void >::size_type>
+class StaticCrsGraph {
+private:
+  typedef ViewTraits<DataType*, Arg1Type, Arg2Type, void> traits;
+
+public:
+  typedef DataType                                            data_type;
+  typedef typename traits::array_layout                       array_layout;
+  typedef typename traits::execution_space                    execution_space;
+  typedef typename traits::device_type                        device_type;
+  typedef SizeType                                            size_type;
+
+  typedef StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > staticcrsgraph_type;
+  typedef StaticCrsGraph< DataType , array_layout , typename traits::host_mirror_space , SizeType > HostMirror;
+  typedef View< const size_type* , array_layout, device_type >  row_map_type;
+  typedef View<       DataType*  , array_layout, device_type >  entries_type;
+
+  entries_type entries;
+  row_map_type row_map;
+
+  //! Construct an empty view.
+  StaticCrsGraph () : entries(), row_map() {}
+
+  //! Copy constructor (shallow copy).
+  StaticCrsGraph (const StaticCrsGraph& rhs) : entries (rhs.entries), row_map (rhs.row_map)
+  {}
+
+  template<class EntriesType, class RowMapType>
+  StaticCrsGraph (const EntriesType& entries_,const RowMapType& row_map_) : entries (entries_), row_map (row_map_)
+  {}
+
+  /** \brief  Assign to a view of the rhs array.
+   *          If the old view is the last view
+   *          then allocated memory is deallocated.
+   */
+  StaticCrsGraph& operator= (const StaticCrsGraph& rhs) {
+    entries = rhs.entries;
+    row_map = rhs.row_map;
+    return *this;
+  }
+
+  /**  \brief  Destroy this view of the array.
+   *           If the last view then allocated memory is deallocated.
+   */
+  ~StaticCrsGraph() {}
+
+  KOKKOS_INLINE_FUNCTION
+  size_type numRows() const {
+    return (row_map.dimension_0 () != 0) ?
+      row_map.dimension_0 () - static_cast<size_type> (1) :
+      static_cast<size_type> (0);
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< class StaticCrsGraphType , class InputSizeType >
+typename StaticCrsGraphType::staticcrsgraph_type
+create_staticcrsgraph( const std::string & label ,
+                 const std::vector< InputSizeType > & input );
+
+template< class StaticCrsGraphType , class InputSizeType >
+typename StaticCrsGraphType::staticcrsgraph_type
+create_staticcrsgraph( const std::string & label ,
+                 const std::vector< std::vector< InputSizeType > > & input );
+
+//----------------------------------------------------------------------------
+
+template< class DataType ,
+          class Arg1Type ,
+          class Arg2Type ,
+          typename SizeType >
+typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & input );
+
+template< class DataType ,
+          class Arg1Type ,
+          class Arg2Type ,
+          typename SizeType >
+typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & input );
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#include <impl/Kokkos_StaticCrsGraph_factory.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class GraphType >
+struct StaticCrsGraphMaximumEntry {
+
+  typedef typename GraphType::execution_space execution_space ;
+  typedef typename GraphType::data_type value_type ;
+
+  const typename GraphType::entries_type entries ;
+
+  StaticCrsGraphMaximumEntry( const GraphType & graph ) : entries( graph.entries ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned i , value_type & update ) const
+    { if ( update < entries(i) ) update = entries(i); }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & update ) const
+    { update = 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & update ,
+             volatile const value_type & input ) const
+    { if ( update < input ) update = input ; }
+};
+
+}
+
+template< class DataType, class Arg1Type, class Arg2Type, typename SizeType >
+DataType maximum_entry( const StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType > & graph )
+{
+  typedef StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType> GraphType ;
+  typedef Impl::StaticCrsGraphMaximumEntry< GraphType > FunctorType ;
+
+  DataType result = 0 ;
+  Kokkos::parallel_reduce( graph.entries.dimension_0(),
+                           FunctorType(graph), result );
+  return result ;
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_CRSARRAY_HPP */
+
diff --git a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..7a916c6ef7c449a041d6d2014033e34c3342f185
--- /dev/null
+++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp
@@ -0,0 +1,848 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_UnorderedMap.hpp
+/// \brief Declaration and definition of Kokkos::UnorderedMap.
+///
+/// This header file declares and defines Kokkos::UnorderedMap and its
+/// related nonmember functions.
+
+#ifndef KOKKOS_UNORDERED_MAP_HPP
+#define KOKKOS_UNORDERED_MAP_HPP
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Functional.hpp>
+
+#include <Kokkos_Bitset.hpp>
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_UnorderedMap_impl.hpp>
+
+
+#include <iostream>
+
+#include <stdint.h>
+#include <stdexcept>
+
+
+namespace Kokkos {
+
+enum { UnorderedMapInvalidIndex = ~0u };
+
+/// \brief First element of the return value of UnorderedMap::insert().
+///
+/// Inserting an element into an UnorderedMap is not guaranteed to
+/// succeed.  There are three possible conditions:
+/// <ol>
+/// <li> <tt>INSERT_FAILED</tt>: The insert failed.  This usually
+///      means that the UnorderedMap ran out of space. </li>
+/// <li> <tt>INSERT_SUCCESS</tt>: The insert succeeded, and the key
+///      did <i>not</i> exist in the table before. </li>
+/// <li> <tt>INSERT_EXISTING</tt>: The insert succeeded, and the key
+///      <i>did</i> exist in the table before.  The new value was
+///      ignored and the old value was left in place. </li>
+/// </ol>
+
+class UnorderedMapInsertResult
+{
+private:
+  enum Status{
+     SUCCESS = 1u << 31
+   , EXISTING = 1u << 30
+   , FREED_EXISTING = 1u << 29
+   , LIST_LENGTH_MASK = ~(SUCCESS | EXISTING | FREED_EXISTING)
+  };
+
+public:
+  /// Did the map successful insert the key/value pair
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool success() const { return (m_status & SUCCESS); }
+
+  /// Was the key already present in the map
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool existing() const { return (m_status & EXISTING); }
+
+  /// Did the map fail to insert the key due to insufficent capacity
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool failed() const { return m_index == UnorderedMapInvalidIndex; }
+
+  /// Did the map lose a race condition to insert a dupulicate key/value pair
+  /// where an index was claimed that needed to be released
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool freed_existing() const { return (m_status & FREED_EXISTING); }
+
+  /// How many iterations through the insert loop did it take before the
+  /// map returned
+  KOKKOS_FORCEINLINE_FUNCTION
+  uint32_t list_position() const { return (m_status & LIST_LENGTH_MASK); }
+
+  /// Index where the key can be found as long as the insert did not fail
+  KOKKOS_FORCEINLINE_FUNCTION
+  uint32_t index() const { return m_index; }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  UnorderedMapInsertResult()
+    : m_index(UnorderedMapInvalidIndex)
+    , m_status(0)
+  {}
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void increment_list_position()
+  {
+    m_status += (list_position() < LIST_LENGTH_MASK) ? 1u : 0u;
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void set_existing(uint32_t i, bool arg_freed_existing)
+  {
+    m_index = i;
+    m_status = EXISTING | (arg_freed_existing ? FREED_EXISTING : 0u) | list_position();
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void set_success(uint32_t i)
+  {
+    m_index = i;
+    m_status = SUCCESS | list_position();
+  }
+
+private:
+  uint32_t m_index;
+  uint32_t m_status;
+};
+
+/// \class UnorderedMap
+/// \brief Thread-safe, performance-portable lookup table.
+///
+/// This class provides a lookup table.  In terms of functionality,
+/// this class compares to std::unordered_map (new in C++11).
+/// "Unordered" means that keys are not stored in any particular
+/// order, unlike (for example) std::map.  "Thread-safe" means that
+/// lookups, insertion, and deletion are safe to call by multiple
+/// threads in parallel.  "Performance-portable" means that parallel
+/// performance of these operations is reasonable, on multiple
+/// hardware platforms.  Platforms on which performance has been
+/// tested include conventional Intel x86 multicore processors, Intel
+/// Xeon Phi ("MIC"), and NVIDIA GPUs.
+///
+/// Parallel performance portability entails design decisions that
+/// might differ from one's expectation for a sequential interface.
+/// This particularly affects insertion of single elements.  In an
+/// interface intended for sequential use, insertion might reallocate
+/// memory if the original allocation did not suffice to hold the new
+/// element.  In this class, insertion does <i>not</i> reallocate
+/// memory.  This means that it might fail.  insert() returns an enum
+/// which indicates whether the insert failed.  There are three
+/// possible conditions:
+/// <ol>
+/// <li> <tt>INSERT_FAILED</tt>: The insert failed.  This usually
+///      means that the UnorderedMap ran out of space. </li>
+/// <li> <tt>INSERT_SUCCESS</tt>: The insert succeeded, and the key
+///      did <i>not</i> exist in the table before. </li>
+/// <li> <tt>INSERT_EXISTING</tt>: The insert succeeded, and the key
+///      <i>did</i> exist in the table before.  The new value was
+///      ignored and the old value was left in place. </li>
+/// </ol>
+///
+/// \tparam Key Type of keys of the lookup table.  If \c const, users
+///   are not allowed to add or remove keys, though they are allowed
+///   to change values.  In that case, the implementation may make
+///   optimizations specific to the <tt>Device</tt>.  For example, if
+///   <tt>Device</tt> is \c Cuda, it may use texture fetches to access
+///   keys.
+///
+/// \tparam Value Type of values stored in the lookup table.  You may use
+///   \c void here, in which case the table will be a set of keys.  If
+///   \c const, users are not allowed to change entries.
+///   In that case, the implementation may make
+///   optimizations specific to the \c Device, such as using texture
+///   fetches to access values.
+///
+/// \tparam Device The Kokkos Device type.
+///
+/// \tparam Hasher Definition of the hash function for instances of
+///   <tt>Key</tt>.  The default will calculate a bitwise hash.
+///
+/// \tparam EqualTo Definition of the equality function for instances of
+///   <tt>Key</tt>.  The default will do a bitwise equality comparison.
+///
+template <   typename Key
+           , typename Value
+           , typename Device = Kokkos::DefaultExecutionSpace
+           , typename Hasher = pod_hash<typename Impl::remove_const<Key>::type>
+           , typename EqualTo = pod_equal_to<typename Impl::remove_const<Key>::type>
+        >
+class UnorderedMap
+{
+private:
+  typedef typename ViewTraits<Key,Device,void,void>::host_mirror_space host_mirror_space ;
+public:
+  //! \name Public types and constants
+  //@{
+
+  //key_types
+  typedef Key declared_key_type;
+  typedef typename Impl::remove_const<declared_key_type>::type key_type;
+  typedef typename Impl::add_const<key_type>::type const_key_type;
+
+  //value_types
+  typedef Value declared_value_type;
+  typedef typename Impl::remove_const<declared_value_type>::type value_type;
+  typedef typename Impl::add_const<value_type>::type const_value_type;
+
+  typedef Device execution_space;
+  typedef Hasher hasher_type;
+  typedef EqualTo  equal_to_type;
+  typedef uint32_t size_type;
+
+  //map_types
+  typedef UnorderedMap<declared_key_type,declared_value_type,execution_space,hasher_type,equal_to_type> declared_map_type;
+  typedef UnorderedMap<key_type,value_type,execution_space,hasher_type,equal_to_type>                   insertable_map_type;
+  typedef UnorderedMap<const_key_type,value_type,execution_space,hasher_type,equal_to_type>             modifiable_map_type;
+  typedef UnorderedMap<const_key_type,const_value_type,execution_space,hasher_type,equal_to_type>       const_map_type;
+
+  static const bool is_set = Impl::is_same<void,value_type>::value;
+  static const bool has_const_key = Impl::is_same<const_key_type,declared_key_type>::value;
+  static const bool has_const_value = is_set || Impl::is_same<const_value_type,declared_value_type>::value;
+
+  static const bool is_insertable_map = !has_const_key && (is_set || !has_const_value);
+  static const bool is_modifiable_map = has_const_key && !has_const_value;
+  static const bool is_const_map = has_const_key && has_const_value;
+
+
+  typedef UnorderedMapInsertResult insert_result;
+
+  typedef UnorderedMap<Key,Value,host_mirror_space,Hasher,EqualTo> HostMirror;
+
+  typedef Impl::UnorderedMapHistogram<const_map_type> histogram_type;
+
+  //@}
+
+private:
+  enum { invalid_index = ~static_cast<size_type>(0) };
+
+  typedef typename Impl::if_c< is_set, int, declared_value_type>::type impl_value_type;
+
+  typedef typename Impl::if_c<   is_insertable_map
+                               , View< key_type *, execution_space>
+                               , View< const key_type *, execution_space, MemoryTraits<RandomAccess> >
+                             >::type key_type_view;
+
+  typedef typename Impl::if_c<   is_insertable_map || is_modifiable_map
+                               , View< impl_value_type *, execution_space>
+                               , View< const impl_value_type *, execution_space, MemoryTraits<RandomAccess> >
+                             >::type value_type_view;
+
+  typedef typename Impl::if_c<   is_insertable_map
+                               , View< size_type *, execution_space>
+                               , View< const size_type *, execution_space, MemoryTraits<RandomAccess> >
+                             >::type size_type_view;
+
+  typedef typename Impl::if_c<   is_insertable_map
+                               , Bitset< execution_space >
+                               , ConstBitset< execution_space>
+                             >::type bitset_type;
+
+  enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 };
+  enum { num_scalars = 3 };
+  typedef View< int[num_scalars], LayoutLeft, execution_space> scalars_view;
+
+public:
+  //! \name Public member functions
+  //@{
+
+  UnorderedMap()
+    : m_bounded_insert()
+    , m_hasher()
+    , m_equal_to()
+    , m_size()
+    , m_available_indexes()
+    , m_hash_lists()
+    , m_next_index()
+    , m_keys()
+    , m_values()
+    , m_scalars()
+  {}
+
+  /// \brief Constructor
+  ///
+  /// \param capacity_hint [in] Initial guess of how many unique keys will be inserted into the map
+  /// \param hash [in] Hasher function for \c Key instances.  The
+  ///   default value usually suffices.
+  UnorderedMap(  size_type capacity_hint, hasher_type hasher = hasher_type(), equal_to_type equal_to = equal_to_type() )
+    : m_bounded_insert(true)
+    , m_hasher(hasher)
+    , m_equal_to(equal_to)
+    , m_size()
+    , m_available_indexes(calculate_capacity(capacity_hint))
+    , m_hash_lists(ViewAllocateWithoutInitializing("UnorderedMap hash list"), Impl::find_hash_size(capacity()))
+    , m_next_index(ViewAllocateWithoutInitializing("UnorderedMap next index"), capacity()+1) // +1 so that the *_at functions can always return a valid reference
+    , m_keys("UnorderedMap keys",capacity()+1)
+    , m_values("UnorderedMap values",(is_set? 1 : capacity()+1))
+    , m_scalars("UnorderedMap scalars")
+  {
+    if (!is_insertable_map) {
+      throw std::runtime_error("Cannot construct a non-insertable (i.e. const key_type) unordered_map");
+    }
+
+    Kokkos::deep_copy(m_hash_lists, invalid_index);
+    Kokkos::deep_copy(m_next_index, invalid_index);
+  }
+
+  void reset_failed_insert_flag()
+  {
+    reset_flag(failed_insert_idx);
+  }
+
+  histogram_type get_histogram()
+  {
+    return histogram_type(*this);
+  }
+
+  //! Clear all entries in the table.
+  void clear()
+  {
+    m_bounded_insert = true;
+
+    if (capacity() == 0) return;
+
+    m_available_indexes.clear();
+
+    Kokkos::deep_copy(m_hash_lists, invalid_index);
+    Kokkos::deep_copy(m_next_index, invalid_index);
+    {
+      const key_type tmp = key_type();
+      Kokkos::deep_copy(m_keys,tmp);
+    }
+    if (is_set){
+      const impl_value_type tmp = impl_value_type();
+      Kokkos::deep_copy(m_values,tmp);
+    }
+    {
+      Kokkos::deep_copy(m_scalars, 0);
+    }
+  }
+
+  /// \brief Change the capacity of the the map
+  ///
+  /// If there are no failed inserts the current size of the map will
+  /// be used as a lower bound for the input capacity.
+  /// If the map is not empty and does not have failed inserts
+  /// and the capacity changes then the current data is copied
+  /// into the resized / rehashed map.
+  ///
+  /// This is <i>not</i> a device function; it may <i>not</i> be
+  /// called in a parallel kernel.
+  bool rehash(size_type requested_capacity = 0)
+  {
+    const bool bounded_insert = (capacity() == 0) || (size() == 0u);
+    return rehash(requested_capacity, bounded_insert );
+  }
+
+  bool rehash(size_type requested_capacity, bool bounded_insert)
+  {
+    if(!is_insertable_map) return false;
+
+    const size_type curr_size = size();
+    requested_capacity = (requested_capacity < curr_size) ? curr_size : requested_capacity;
+
+    insertable_map_type tmp(requested_capacity, m_hasher, m_equal_to);
+
+    if (curr_size) {
+      tmp.m_bounded_insert = false;
+      Impl::UnorderedMapRehash<insertable_map_type> f(tmp,*this);
+      f.apply();
+    }
+    tmp.m_bounded_insert = bounded_insert;
+
+    *this = tmp;
+
+    return true;
+  }
+
+  /// \brief The number of entries in the table.
+  ///
+  /// This method has undefined behavior when erasable() is true.
+  ///
+  /// Note that this is not a device function; it cannot be called in
+  /// a parallel kernel.  The value is not stored as a variable; it
+  /// must be computed.
+  size_type size() const
+  {
+    if( capacity() == 0u ) return 0u;
+    if (modified()) {
+      m_size = m_available_indexes.count();
+      reset_flag(modified_idx);
+    }
+    return m_size;
+  }
+
+  /// \brief The current number of failed insert() calls.
+  ///
+  /// This is <i>not</i> a device function; it may <i>not</i> be
+  /// called in a parallel kernel.  The value is not stored as a
+  /// variable; it must be computed.
+  bool failed_insert() const
+  {
+    return get_flag(failed_insert_idx);
+  }
+
+  bool erasable() const
+  {
+    return is_insertable_map ? get_flag(erasable_idx) : false;
+  }
+
+  bool begin_erase()
+  {
+    bool result = !erasable();
+    if (is_insertable_map && result) {
+      execution_space::fence();
+      set_flag(erasable_idx);
+      execution_space::fence();
+    }
+    return result;
+  }
+
+  bool end_erase()
+  {
+    bool result = erasable();
+    if (is_insertable_map && result) {
+      execution_space::fence();
+      Impl::UnorderedMapErase<declared_map_type> f(*this);
+      f.apply();
+      execution_space::fence();
+      reset_flag(erasable_idx);
+    }
+    return result;
+  }
+
+  /// \brief The maximum number of entries that the table can hold.
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type capacity() const
+  { return m_available_indexes.size(); }
+
+  /// \brief The number of hash table "buckets."
+  ///
+  /// This is different than the number of entries that the table can
+  /// hold.  Each key hashes to an index in [0, hash_capacity() - 1].
+  /// That index can hold zero or more entries.  This class decides
+  /// what hash_capacity() should be, given the user's upper bound on
+  /// the number of entries the table must be able to hold.
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.
+  KOKKOS_INLINE_FUNCTION
+  size_type hash_capacity() const
+  { return m_hash_lists.dimension_0(); }
+
+  //---------------------------------------------------------------------------
+  //---------------------------------------------------------------------------
+
+
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.  As discussed in the class documentation, it need not
+  /// succeed.  The return value tells you if it did.
+  ///
+  /// \param k [in] The key to attempt to insert.
+  /// \param v [in] The corresponding value to attempt to insert.  If
+  ///   using this class as a set (with Value = void), then you need not
+  ///   provide this value.
+  KOKKOS_INLINE_FUNCTION
+  insert_result insert(key_type const& k, impl_value_type const&v = impl_value_type()) const
+  {
+    insert_result result;
+
+    if ( !is_insertable_map || capacity() == 0u || m_scalars((int)erasable_idx) ) {
+      return result;
+    }
+
+    if ( !m_scalars((int)modified_idx) ) {
+      m_scalars((int)modified_idx) = true;
+    }
+
+    int volatile & failed_insert_ref = m_scalars((int)failed_insert_idx) ;
+
+    const size_type hash_value = m_hasher(k);
+    const size_type hash_list = hash_value % m_hash_lists.dimension_0();
+
+    size_type * curr_ptr   = & m_hash_lists[ hash_list ];
+    size_type new_index    = invalid_index ;
+
+    // Force integer multiply to long
+    size_type index_hint = static_cast<size_type>( (static_cast<double>(hash_list) * capacity()) / m_hash_lists.dimension_0());
+
+    size_type find_attempts = 0;
+
+    enum { bounded_find_attempts = 32u };
+    const size_type max_attempts = (m_bounded_insert && (bounded_find_attempts < m_available_indexes.max_hint()) ) ?
+                                    bounded_find_attempts :
+                                    m_available_indexes.max_hint();
+
+    bool not_done = true ;
+
+#if defined( __MIC__ )
+      #pragma noprefetch
+#endif
+    while ( not_done ) {
+
+      // Continue searching the unordered list for this key,
+      // list will only be appended during insert phase.
+      // Need volatile_load as other threads may be appending.
+      size_type curr = volatile_load(curr_ptr);
+
+      KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]);
+#if defined( __MIC__ )
+      #pragma noprefetch
+#endif
+      while ( curr != invalid_index && ! m_equal_to( volatile_load(&m_keys[curr]), k) ) {
+        result.increment_list_position();
+        index_hint = curr;
+        curr_ptr = &m_next_index[curr];
+        curr = volatile_load(curr_ptr);
+        KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]);
+      }
+
+      //------------------------------------------------------------
+      // If key already present then return that index.
+      if ( curr != invalid_index ) {
+
+        const bool free_existing = new_index != invalid_index;
+        if ( free_existing ) {
+          // Previously claimed an unused entry that was not inserted.
+          // Release this unused entry immediately.
+          if (!m_available_indexes.reset(new_index) ) {
+            printf("Unable to free existing\n");
+          }
+
+        }
+
+        result.set_existing(curr, free_existing);
+        not_done = false ;
+      }
+      //------------------------------------------------------------
+      // Key is not currently in the map.
+      // If the thread has claimed an entry try to insert now.
+      else {
+
+        //------------------------------------------------------------
+        // If have not already claimed an unused entry then do so now.
+        if (new_index == invalid_index) {
+
+          bool found = false;
+          // use the hash_list as the flag for the search direction
+          Kokkos::tie(found, index_hint) = m_available_indexes.find_any_unset_near( index_hint, hash_list );
+
+          // found and index and this thread set it
+          if ( !found && ++find_attempts >= max_attempts ) {
+            failed_insert_ref = true;
+            not_done = false ;
+          }
+          else if (m_available_indexes.set(index_hint) ) {
+            new_index = index_hint;
+            // Set key and value
+            KOKKOS_NONTEMPORAL_PREFETCH_STORE(&m_keys[new_index]);
+            m_keys[new_index] = k ;
+
+            if (!is_set) {
+              KOKKOS_NONTEMPORAL_PREFETCH_STORE(&m_values[new_index]);
+              m_values[new_index] = v ;
+            }
+
+            // Do not proceed until key and value are updated in global memory
+            memory_fence();
+          }
+        }
+        else if (failed_insert_ref) {
+          not_done = false;
+        }
+
+        // Attempt to append claimed entry into the list.
+        // Another thread may also be trying to append the same list so protect with atomic.
+        if ( new_index != invalid_index &&
+             curr ==  atomic_compare_exchange(curr_ptr, static_cast<size_type>(invalid_index), new_index) ) {
+          // Succeeded in appending
+          result.set_success(new_index);
+          not_done = false ;
+        }
+      }
+    } // while ( not_done )
+
+    return result ;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool erase(key_type const& k) const
+  {
+    bool result = false;
+
+    if(is_insertable_map && 0u < capacity() && m_scalars((int)erasable_idx)) {
+
+      if ( ! m_scalars((int)modified_idx) ) {
+        m_scalars((int)modified_idx) = true;
+      }
+
+      size_type index = find(k);
+      if (valid_at(index)) {
+        m_available_indexes.reset(index);
+        result = true;
+      }
+    }
+
+    return result;
+  }
+
+  /// \brief Find the given key \c k, if it exists in the table.
+  ///
+  /// \return If the key exists in the table, the index of the
+  ///   value corresponding to that key; otherwise, an invalid index.
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.
+  KOKKOS_INLINE_FUNCTION
+  size_type find( const key_type & k) const
+  {
+    size_type curr = 0u < capacity() ? m_hash_lists( m_hasher(k) % m_hash_lists.dimension_0() ) : invalid_index ;
+
+    KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]);
+    while (curr != invalid_index && !m_equal_to( m_keys[curr], k) ) {
+      KOKKOS_NONTEMPORAL_PREFETCH_LOAD(&m_keys[curr != invalid_index ? curr : 0]);
+      curr = m_next_index[curr];
+    }
+
+    return curr;
+  }
+
+  /// \brief Does the key exist in the map
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.
+  KOKKOS_INLINE_FUNCTION
+  bool exists( const key_type & k) const
+  {
+    return valid_at(find(k));
+  }
+
+
+  /// \brief Get the value with \c i as its direct index.
+  ///
+  /// \param i [in] Index directly into the array of entries.
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.
+  ///
+  /// 'const value_type' via Cuda texture fetch must return by value.
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::if_c< (is_set || has_const_value), impl_value_type, impl_value_type &>::type
+  value_at(size_type i) const
+  {
+    return m_values[ is_set ? 0 : (i < capacity() ? i : capacity()) ];
+  }
+
+  /// \brief Get the key with \c i as its direct index.
+  ///
+  /// \param i [in] Index directly into the array of entries.
+  ///
+  /// This <i>is</i> a device function; it may be called in a parallel
+  /// kernel.
+  KOKKOS_FORCEINLINE_FUNCTION
+  key_type key_at(size_type i) const
+  {
+    return m_keys[ i < capacity() ? i : capacity() ];
+  }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool valid_at(size_type i) const
+  {
+    return m_available_indexes.test(i);
+  }
+
+  template <typename SKey, typename SValue>
+  UnorderedMap( UnorderedMap<SKey,SValue,Device,Hasher,EqualTo> const& src,
+                typename Impl::enable_if< Impl::UnorderedMapCanAssign<declared_key_type,declared_value_type,SKey,SValue>::value,int>::type = 0
+              )
+    : m_bounded_insert(src.m_bounded_insert)
+    , m_hasher(src.m_hasher)
+    , m_equal_to(src.m_equal_to)
+    , m_size(src.m_size)
+    , m_available_indexes(src.m_available_indexes)
+    , m_hash_lists(src.m_hash_lists)
+    , m_next_index(src.m_next_index)
+    , m_keys(src.m_keys)
+    , m_values(src.m_values)
+    , m_scalars(src.m_scalars)
+  {}
+
+
+  template <typename SKey, typename SValue>
+  typename Impl::enable_if< Impl::UnorderedMapCanAssign<declared_key_type,declared_value_type,SKey,SValue>::value
+                           ,declared_map_type & >::type
+  operator=( UnorderedMap<SKey,SValue,Device,Hasher,EqualTo> const& src)
+  {
+    m_bounded_insert = src.m_bounded_insert;
+    m_hasher = src.m_hasher;
+    m_equal_to = src.m_equal_to;
+    m_size = src.m_size;
+    m_available_indexes = src.m_available_indexes;
+    m_hash_lists = src.m_hash_lists;
+    m_next_index = src.m_next_index;
+    m_keys = src.m_keys;
+    m_values = src.m_values;
+    m_scalars = src.m_scalars;
+    return *this;
+  }
+
+  template <typename SKey, typename SValue, typename SDevice>
+  typename Impl::enable_if< Impl::is_same< typename Impl::remove_const<SKey>::type, key_type>::value &&
+                            Impl::is_same< typename Impl::remove_const<SValue>::type, value_type>::value
+                          >::type
+  create_copy_view( UnorderedMap<SKey, SValue, SDevice, Hasher,EqualTo> const& src)
+  {
+    if (m_hash_lists.ptr_on_device() != src.m_hash_lists.ptr_on_device()) {
+
+      insertable_map_type tmp;
+
+      tmp.m_bounded_insert = src.m_bounded_insert;
+      tmp.m_hasher = src.m_hasher;
+      tmp.m_equal_to = src.m_equal_to;
+      tmp.m_size = src.size();
+      tmp.m_available_indexes = bitset_type( src.capacity() );
+      tmp.m_hash_lists        = size_type_view( ViewAllocateWithoutInitializing("UnorderedMap hash list"), src.m_hash_lists.dimension_0() );
+      tmp.m_next_index        = size_type_view( ViewAllocateWithoutInitializing("UnorderedMap next index"), src.m_next_index.dimension_0() );
+      tmp.m_keys              = key_type_view( ViewAllocateWithoutInitializing("UnorderedMap keys"), src.m_keys.dimension_0() );
+      tmp.m_values            = value_type_view( ViewAllocateWithoutInitializing("UnorderedMap values"), src.m_values.dimension_0() );
+      tmp.m_scalars           = scalars_view("UnorderedMap scalars");
+
+      Kokkos::deep_copy(tmp.m_available_indexes, src.m_available_indexes);
+
+      typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, typename SDevice::memory_space > raw_deep_copy;
+
+      raw_deep_copy(tmp.m_hash_lists.ptr_on_device(), src.m_hash_lists.ptr_on_device(), sizeof(size_type)*src.m_hash_lists.dimension_0());
+      raw_deep_copy(tmp.m_next_index.ptr_on_device(), src.m_next_index.ptr_on_device(), sizeof(size_type)*src.m_next_index.dimension_0());
+      raw_deep_copy(tmp.m_keys.ptr_on_device(), src.m_keys.ptr_on_device(), sizeof(key_type)*src.m_keys.dimension_0());
+      if (!is_set) {
+        raw_deep_copy(tmp.m_values.ptr_on_device(), src.m_values.ptr_on_device(), sizeof(impl_value_type)*src.m_values.dimension_0());
+      }
+      raw_deep_copy(tmp.m_scalars.ptr_on_device(), src.m_scalars.ptr_on_device(), sizeof(int)*num_scalars );
+
+      *this = tmp;
+    }
+  }
+
+  //@}
+private: // private member functions
+
+  bool modified() const
+  {
+    return get_flag(modified_idx);
+  }
+
+  void set_flag(int flag) const
+  {
+    typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy;
+    const int true_ = true;
+    raw_deep_copy(m_scalars.ptr_on_device() + flag, &true_, sizeof(int));
+  }
+
+  void reset_flag(int flag) const
+  {
+    typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy;
+    const int false_ = false;
+    raw_deep_copy(m_scalars.ptr_on_device() + flag, &false_, sizeof(int));
+  }
+
+  bool get_flag(int flag) const
+  {
+    typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename execution_space::memory_space > raw_deep_copy;
+    int result = false;
+    raw_deep_copy(&result, m_scalars.ptr_on_device() + flag, sizeof(int));
+    return result;
+  }
+
+  static uint32_t calculate_capacity(uint32_t capacity_hint)
+  {
+    // increase by 16% and round to nears multiple of 128
+    return capacity_hint ? ((static_cast<uint32_t>(7ull*capacity_hint/6u) + 127u)/128u)*128u : 128u;
+  }
+
+private: // private members
+  bool              m_bounded_insert;
+  hasher_type       m_hasher;
+  equal_to_type     m_equal_to;
+  mutable size_type m_size;
+  bitset_type       m_available_indexes;
+  size_type_view    m_hash_lists;
+  size_type_view    m_next_index;
+  key_type_view     m_keys;
+  value_type_view   m_values;
+  scalars_view      m_scalars;
+
+  template <typename KKey, typename VValue, typename DDevice, typename HHash, typename EEqualTo>
+  friend class UnorderedMap;
+
+  template <typename UMap>
+  friend struct Impl::UnorderedMapErase;
+
+  template <typename UMap>
+  friend struct Impl::UnorderedMapHistogram;
+
+  template <typename UMap>
+  friend struct Impl::UnorderedMapPrint;
+};
+
+// Specialization of deep_copy for two UnorderedMap objects.
+template <  typename DKey, typename DT, typename DDevice
+          , typename SKey, typename ST, typename SDevice
+          , typename Hasher, typename EqualTo >
+inline void deep_copy(         UnorderedMap<DKey, DT, DDevice, Hasher, EqualTo> & dst
+                       , const UnorderedMap<SKey, ST, SDevice, Hasher, EqualTo> & src )
+{
+  dst.create_copy_view(src);
+}
+
+
+} // namespace Kokkos
+
+#endif //KOKKOS_UNORDERED_MAP_HPP
diff --git a/lib/kokkos/containers/src/Kokkos_Vector.hpp b/lib/kokkos/containers/src/Kokkos_Vector.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..db54b0c350ff18cc524066d52325fbca8d8701be
--- /dev/null
+++ b/lib/kokkos/containers/src/Kokkos_Vector.hpp
@@ -0,0 +1,287 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VECTOR_HPP
+#define KOKKOS_VECTOR_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_DualView.hpp>
+
+/* Drop in replacement for std::vector based on Kokkos::DualView
+ * Most functions only work on the host (it will not compile if called from device kernel)
+ *
+ */
+  namespace Kokkos {
+
+template <typename Scalar, class Space = Kokkos::DefaultExecutionSpace >
+class vector : public DualView<Scalar*,LayoutLeft,Space> {
+public:
+  typedef typename Space::memory_space memory_space;
+  typedef typename Space::execution_space execution_space;
+  typedef typename Kokkos::Device<execution_space,memory_space> device_type;
+
+  typedef Scalar value_type;
+  typedef Scalar* pointer;
+  typedef const Scalar* const_pointer;
+  typedef Scalar* reference;
+  typedef const Scalar* const_reference;
+  typedef Scalar* iterator;
+  typedef const Scalar* const_iterator;
+
+private:
+  size_t _size;
+  typedef size_t size_type;
+  float _extra_storage;
+  typedef DualView<Scalar*,LayoutLeft,Space> DV;
+
+
+public:
+#ifdef KOKKOS_CUDA_USE_UVM
+  KOKKOS_INLINE_FUNCTION Scalar& operator() (int i) const {return DV::h_view(i);};
+  KOKKOS_INLINE_FUNCTION Scalar& operator[] (int i) const {return DV::h_view(i);};
+#else
+  inline Scalar& operator() (int i) const {return DV::h_view(i);};
+  inline Scalar& operator[] (int i) const {return DV::h_view(i);};
+#endif
+
+  /* Member functions which behave like std::vector functions */
+
+  vector():DV() {
+    _size = 0;
+    _extra_storage = 1.1;
+    DV::modified_host() = 1;
+  };
+
+
+  vector(int n, Scalar val=Scalar()):DualView<Scalar*,LayoutLeft,Space>("Vector",size_t(n*(1.1))) {
+    _size = n;
+    _extra_storage = 1.1;
+    DV::modified_host() = 1;
+
+    assign(n,val);
+  }
+
+
+  void resize(size_t n) {
+    if(n>=capacity())
+      DV::resize(size_t (n*_extra_storage));
+    _size = n;
+  }
+
+  void resize(size_t n, const Scalar& val) {
+    assign(n,val);
+  }
+
+  void assign (size_t n, const Scalar& val) {
+
+    /* Resize if necessary (behavour of std:vector) */
+
+    if(n>capacity())
+      DV::resize(size_t (n*_extra_storage));
+    _size = n;
+
+          /* Assign value either on host or on device */
+
+    if( DV::modified_host() >= DV::modified_device() ) {
+      set_functor_host f(DV::h_view,val);
+      parallel_for(n,f);
+      DV::t_host::execution_space::fence();
+      DV::modified_host()++;
+    } else {
+      set_functor f(DV::d_view,val);
+      parallel_for(n,f);
+      DV::t_dev::execution_space::fence();
+      DV::modified_device()++;
+    }
+  }
+
+  void reserve(size_t n) {
+    DV::resize(size_t (n*_extra_storage));
+  }
+
+  void push_back(Scalar val) {
+    DV::modified_host()++;
+    if(_size == capacity()) {
+      size_t new_size = _size*_extra_storage;
+      if(new_size == _size) new_size++;
+      DV::resize(new_size);
+    }
+
+    DV::h_view(_size) = val;
+    _size++;
+
+  };
+
+  void pop_back() {
+    _size--;
+  };
+
+  void clear() {
+    _size = 0;
+  }
+
+  size_type size() const {return _size;};
+  size_type max_size() const {return 2000000000;}
+  size_type capacity() const {return DV::capacity();};
+  bool empty() const {return _size==0;};
+
+  iterator begin() const {return &DV::h_view(0);};
+
+  iterator end() const {return &DV::h_view(_size);};
+
+
+  /* std::algorithms wich work originally with iterators, here they are implemented as member functions */
+
+  size_t
+  lower_bound (const size_t& start,
+               const size_t& theEnd,
+               const Scalar& comp_val) const
+  {
+    int lower = start; // FIXME (mfh 24 Apr 2014) narrowing conversion
+    int upper = _size > theEnd? theEnd : _size-1; // FIXME (mfh 24 Apr 2014) narrowing conversion
+    if (upper <= lower) {
+      return theEnd;
+    }
+
+    Scalar lower_val = DV::h_view(lower);
+    Scalar upper_val = DV::h_view(upper);
+    size_t idx = (upper+lower)/2;
+    Scalar val = DV::h_view(idx);
+    if(val>upper_val) return upper;
+    if(val<lower_val) return start;
+
+    while(upper>lower) {
+      if(comp_val>val) {
+        lower = ++idx;
+      } else {
+        upper = idx;
+      }
+      idx = (upper+lower)/2;
+      val = DV::h_view(idx);
+    }
+    return idx;
+  }
+
+  bool is_sorted() {
+    for(int i=0;i<_size-1;i++) {
+      if(DV::h_view(i)>DV::h_view(i+1)) return false;
+    }
+    return true;
+  }
+
+  iterator find(Scalar val) const {
+    if(_size == 0) return end();
+
+    int upper,lower,current;
+    current = _size/2;
+    upper = _size-1;
+    lower = 0;
+
+    if((val<DV::h_view(0)) || (val>DV::h_view(_size-1)) ) return end();
+
+    while(upper>lower)
+    {
+      if(val>DV::h_view(current)) lower = current+1;
+      else upper = current;
+      current = (upper+lower)/2;
+    }
+
+    if(val==DV::h_view(current)) return &DV::h_view(current);
+    else return end();
+  }
+
+  /* Additional functions for data management */
+
+  void device_to_host(){
+    deep_copy(DV::h_view,DV::d_view);
+  }
+  void host_to_device() const {
+    deep_copy(DV::d_view,DV::h_view);
+  }
+
+  void on_host() {
+    DV::modified_host() = DV::modified_device() + 1;
+  }
+  void on_device() {
+    DV::modified_device() = DV::modified_host() + 1;
+  }
+
+  void set_overallocation(float extra) {
+    _extra_storage = 1.0 + extra;
+  }
+
+
+public:
+  struct set_functor {
+    typedef typename DV::t_dev::execution_space execution_space;
+    typename DV::t_dev _data;
+    Scalar _val;
+
+    set_functor(typename DV::t_dev data, Scalar val) :
+      _data(data),_val(val) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int &i) const {
+      _data(i) = _val;
+    }
+  };
+
+  struct set_functor_host {
+    typedef typename DV::t_host::execution_space execution_space;
+    typename DV::t_host _data;
+    Scalar _val;
+
+    set_functor_host(typename DV::t_host data, Scalar val) :
+      _data(data),_val(val) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (const int &i) const {
+      _data(i) = _val;
+    }
+  };
+
+};
+
+
+}
+#endif
diff --git a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..7de290e71138d5660563d5ab27fc0c86ef27762e
--- /dev/null
+++ b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp
@@ -0,0 +1,173 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_BITSET_IMPL_HPP
+#define KOKKOS_BITSET_IMPL_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <stdint.h>
+
+#include <cstdio>
+#include <climits>
+#include <iostream>
+#include <iomanip>
+
+namespace Kokkos { namespace Impl {
+
+KOKKOS_FORCEINLINE_FUNCTION
+unsigned rotate_right(unsigned i, int r)
+{
+  enum { size = static_cast<int>(sizeof(unsigned)*CHAR_BIT) };
+  return r ? ((i >> r) | (i << (size-r))) : i ;
+}
+
+KOKKOS_FORCEINLINE_FUNCTION
+int bit_scan_forward(unsigned i)
+{
+#if defined( __CUDA_ARCH__ )
+  return __ffs(i) - 1;
+#elif defined( __GNUC__ ) || defined( __GNUG__ )
+  return __builtin_ffs(i) - 1;
+#elif defined( __INTEL_COMPILER )
+  return _bit_scan_forward(i);
+#else
+
+  unsigned t = 1u;
+  int r = 0;
+  while (i && (i & t == 0))
+  {
+    t = t << 1;
+    ++r;
+  }
+  return r;
+#endif
+}
+
+
+KOKKOS_FORCEINLINE_FUNCTION
+int bit_scan_reverse(unsigned i)
+{
+  enum { shift = static_cast<int>(sizeof(unsigned)*CHAR_BIT - 1) };
+#if defined( __CUDA_ARCH__ )
+  return shift - __clz(i);
+#elif defined( __GNUC__ ) || defined( __GNUG__ )
+  return shift - __builtin_clz(i);
+#elif defined( __INTEL_COMPILER )
+  return _bit_scan_reverse(i);
+#else
+  unsigned t = 1u << shift;
+  int r = 0;
+  while (i && (i & t == 0))
+  {
+    t = t >> 1;
+    ++r;
+  }
+  return r;
+#endif
+}
+
+
+// count the bits set
+KOKKOS_FORCEINLINE_FUNCTION
+int popcount(unsigned i)
+{
+#if defined( __CUDA_ARCH__ )
+  return __popc(i);
+#elif defined( __GNUC__ ) || defined( __GNUG__ )
+  return __builtin_popcount(i);
+#elif defined ( __INTEL_COMPILER )
+  return _popcnt32(i);
+#else
+  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
+  i = i - ((i >> 1) & ~0u/3u);                                         // temp
+  i = (i & ~0u/15u*3u) + ((i >> 2) & ~0u/15u*3u);                      // temp
+  i = (i + (i >> 4)) & ~0u/255u*15u;                                   // temp
+  return (int)((i * (~0u/255u)) >> (sizeof(unsigned) - 1) * CHAR_BIT); // count
+#endif
+}
+
+
+template <typename Bitset>
+struct BitsetCount
+{
+  typedef Bitset bitset_type;
+  typedef typename bitset_type::execution_space::execution_space execution_space;
+  typedef typename bitset_type::size_type size_type;
+  typedef size_type value_type;
+
+  bitset_type m_bitset;
+
+  BitsetCount( bitset_type const& bitset)
+    : m_bitset(bitset)
+  {}
+
+  size_type apply() const
+  {
+    size_type count = 0u;
+    parallel_reduce(m_bitset.m_blocks.dimension_0(), *this, count);
+    return count;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & count)
+  {
+    count = 0u;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & count, const volatile size_type & incr )
+  {
+    count += incr;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i, value_type & count) const
+  {
+    count += popcount(m_bitset.m_blocks[i]);
+  }
+};
+
+}} //Kokkos::Impl
+
+#endif // KOKKOS_BITSET_IMPL_HPP
+
diff --git a/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..c87bb8a3a37cb6820d31bdd691cf447b20bbd185
--- /dev/null
+++ b/lib/kokkos/containers/src/impl/Kokkos_Functional_impl.hpp
@@ -0,0 +1,195 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_FUNCTIONAL_IMPL_HPP
+#define KOKKOS_FUNCTIONAL_IMPL_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <stdint.h>
+
+namespace Kokkos { namespace Impl {
+
+// MurmurHash3 was written by Austin Appleby, and is placed in the public
+// domain. The author hereby disclaims copyright to this source code.
+KOKKOS_FORCEINLINE_FUNCTION
+uint32_t getblock32 ( const uint8_t * p, int i )
+{
+// used to avoid aliasing error which could cause errors with
+// forced inlining
+  return    ((uint32_t)p[i*4+0])
+          | ((uint32_t)p[i*4+1] << 8)
+          | ((uint32_t)p[i*4+2] << 16)
+          | ((uint32_t)p[i*4+3] << 24);
+}
+
+KOKKOS_FORCEINLINE_FUNCTION
+uint32_t rotl32 ( uint32_t x, int8_t r )
+{ return (x << r) | (x >> (32 - r)); }
+
+KOKKOS_FORCEINLINE_FUNCTION
+uint32_t fmix32 ( uint32_t h )
+{
+  h ^= h >> 16;
+  h *= 0x85ebca6b;
+  h ^= h >> 13;
+  h *= 0xc2b2ae35;
+  h ^= h >> 16;
+
+  return h;
+}
+
+KOKKOS_INLINE_FUNCTION
+uint32_t MurmurHash3_x86_32 ( const void * key, int len, uint32_t seed )
+{
+  const uint8_t * data = (const uint8_t*)key;
+  const int nblocks = len / 4;
+
+  uint32_t h1 = seed;
+
+  const uint32_t c1 = 0xcc9e2d51;
+  const uint32_t c2 = 0x1b873593;
+
+  //----------
+  // body
+
+  for(int i=0; i<nblocks; ++i)
+  {
+    uint32_t k1 = getblock32(data,i);
+
+    k1 *= c1;
+    k1 = rotl32(k1,15);
+    k1 *= c2;
+
+    h1 ^= k1;
+    h1 = rotl32(h1,13);
+    h1 = h1*5+0xe6546b64;
+  }
+
+  //----------
+  // tail
+
+  const uint8_t * tail = (const uint8_t*)(data + nblocks*4);
+
+  uint32_t k1 = 0;
+
+  switch(len & 3)
+  {
+  case 3: k1 ^= tail[2] << 16;
+  case 2: k1 ^= tail[1] << 8;
+  case 1: k1 ^= tail[0];
+          k1 *= c1; k1 = rotl32(k1,15); k1 *= c2; h1 ^= k1;
+  };
+
+  //----------
+  // finalization
+
+  h1 ^= len;
+
+  h1 = fmix32(h1);
+
+  return h1;
+}
+
+
+#if defined( __GNUC__ ) /* GNU C   */ || \
+    defined( __GNUG__ ) /* GNU C++ */ || \
+    defined( __clang__ )
+
+#define KOKKOS_MAY_ALIAS __attribute__((__may_alias__))
+
+#else
+
+#define KOKKOS_MAY_ALIAS
+
+#endif
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+bool bitwise_equal(T const * const a_ptr, T const * const b_ptr)
+{
+  typedef uint64_t KOKKOS_MAY_ALIAS T64;
+  typedef uint32_t KOKKOS_MAY_ALIAS T32;
+  typedef uint16_t KOKKOS_MAY_ALIAS T16;
+  typedef uint8_t  KOKKOS_MAY_ALIAS T8;
+
+  enum {
+    NUM_8  = sizeof(T),
+    NUM_16 = NUM_8 / 2,
+    NUM_32 = NUM_8 / 4,
+    NUM_64 = NUM_8 / 8
+  };
+
+  union {
+    T   const * const ptr;
+    T64 const * const ptr64;
+    T32 const * const ptr32;
+    T16 const * const ptr16;
+    T8  const * const ptr8;
+  } a = {a_ptr}, b = {b_ptr};
+
+  bool result = true;
+
+  for (int i=0; i < NUM_64; ++i) {
+    result = result && a.ptr64[i] == b.ptr64[i];
+  }
+
+  if ( NUM_64*2 < NUM_32 ) {
+    result = result && a.ptr32[NUM_64*2] == b.ptr32[NUM_64*2];
+  }
+
+  if ( NUM_32*2 < NUM_16 ) {
+    result = result && a.ptr16[NUM_32*2] == b.ptr16[NUM_32*2];
+  }
+
+  if ( NUM_16*2 < NUM_8 ) {
+    result = result && a.ptr8[NUM_16*2] == b.ptr8[NUM_16*2];
+  }
+
+  return result;
+}
+
+
+
+#undef KOKKOS_MAY_ALIAS
+
+}} // namespace Kokkos::Impl
+
+#endif //KOKKOS_FUNCTIONAL_IMPL_HPP
diff --git a/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp b/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..c52fc24359b8f7bd34489d94914ea304f7bc3425
--- /dev/null
+++ b/lib/kokkos/containers/src/impl/Kokkos_StaticCrsGraph_factory.hpp
@@ -0,0 +1,208 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP
+#define KOKKOS_IMPL_STATICCRSGRAPH_FACTORY_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
+inline
+typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view ,
+                    typename Impl::enable_if< ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 )
+{
+  return view ;
+}
+
+template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
+inline
+typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view )
+{
+  // Force copy:
+  //typedef Impl::ViewAssignment< Impl::ViewDefault > alloc ; // unused
+  typedef StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >  staticcrsgraph_type ;
+
+  typename staticcrsgraph_type::HostMirror               tmp ;
+  typename staticcrsgraph_type::row_map_type::HostMirror tmp_row_map = create_mirror( view.row_map);
+
+  // Allocation to match:
+  tmp.row_map = tmp_row_map ; // Assignment of 'const' from 'non-const'
+  tmp.entries = create_mirror( view.entries );
+
+
+  // Deep copy:
+  deep_copy( tmp_row_map , view.row_map );
+  deep_copy( tmp.entries , view.entries );
+
+  return tmp ;
+}
+
+template< class DataType , class Arg1Type , class Arg2Type , typename SizeType >
+inline
+typename StaticCrsGraph< DataType , Arg1Type , Arg2Type , SizeType >::HostMirror
+create_mirror_view( const StaticCrsGraph<DataType,Arg1Type,Arg2Type,SizeType > & view ,
+                    typename Impl::enable_if< ! ViewTraits<DataType,Arg1Type,Arg2Type,void>::is_hostspace >::type * = 0 )
+{
+  return create_mirror( view );
+}
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< class StaticCrsGraphType , class InputSizeType >
+inline
+typename StaticCrsGraphType::staticcrsgraph_type
+create_staticcrsgraph( const std::string & label ,
+                 const std::vector< InputSizeType > & input )
+{
+  typedef StaticCrsGraphType                  output_type ;
+  //typedef std::vector< InputSizeType >  input_type ; // unused
+
+  typedef typename output_type::entries_type   entries_type ;
+
+  typedef View< typename output_type::size_type [] ,
+                typename output_type::array_layout ,
+                typename output_type::execution_space > work_type ;
+
+  output_type output ;
+
+  // Create the row map:
+
+  const size_t length = input.size();
+
+  {
+    work_type row_work( "tmp" , length + 1 );
+
+    typename work_type::HostMirror row_work_host =
+      create_mirror_view( row_work );
+
+    size_t sum = 0 ;
+    row_work_host[0] = 0 ;
+    for ( size_t i = 0 ; i < length ; ++i ) {
+      row_work_host[i+1] = sum += input[i];
+    }
+
+    deep_copy( row_work , row_work_host );
+
+    output.entries   = entries_type( label , sum );
+    output.row_map   = row_work ;
+  }
+
+  return output ;
+}
+
+//----------------------------------------------------------------------------
+
+template< class StaticCrsGraphType , class InputSizeType >
+inline
+typename StaticCrsGraphType::staticcrsgraph_type
+create_staticcrsgraph( const std::string & label ,
+                 const std::vector< std::vector< InputSizeType > > & input )
+{
+  typedef StaticCrsGraphType                  output_type ;
+  typedef typename output_type::entries_type  entries_type ;
+
+  static_assert( entries_type::rank == 1
+               , "Graph entries view must be rank one" );
+
+  typedef View< typename output_type::size_type [] ,
+                typename output_type::array_layout ,
+                typename output_type::execution_space > work_type ;
+
+  output_type output ;
+
+    // Create the row map:
+
+  const size_t length = input.size();
+
+  {
+    work_type row_work( "tmp" , length + 1 );
+
+    typename work_type::HostMirror row_work_host =
+      create_mirror_view( row_work );
+
+    size_t sum = 0 ;
+    row_work_host[0] = 0 ;
+    for ( size_t i = 0 ; i < length ; ++i ) {
+      row_work_host[i+1] = sum += input[i].size();
+    }
+
+    deep_copy( row_work , row_work_host );
+
+    output.entries   = entries_type( label , sum );
+    output.row_map   = row_work ;
+  }
+
+  // Fill in the entries:
+  {
+    typename entries_type::HostMirror host_entries =
+      create_mirror_view( output.entries );
+
+    size_t sum = 0 ;
+    for ( size_t i = 0 ; i < length ; ++i ) {
+      for ( size_t j = 0 ; j < input[i].size() ; ++j , ++sum ) {
+        host_entries( sum ) = input[i][j] ;
+      }
+    }
+
+    deep_copy( output.entries , host_entries );
+  }
+
+  return output ;
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_IMPL_CRSARRAY_FACTORY_HPP */
+
diff --git a/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..843fd3a8089999ab80b23506c2206e7a5de325e9
--- /dev/null
+++ b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
@@ -0,0 +1,101 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_UnorderedMap.hpp>
+
+namespace Kokkos { namespace Impl {
+
+uint32_t find_hash_size(uint32_t size)
+{
+  if (size == 0u) return 0u;
+
+  // these primes try to preserve randomness of hash
+  static const uint32_t primes [] = {
+        3, 7, 13, 23, 53, 97, 193, 389, 769, 1543
+      , 2237, 2423, 2617, 2797, 2999, 3167, 3359, 3539
+      , 3727, 3911, 4441 , 4787 , 5119 , 5471 , 5801 , 6143 , 6521 , 6827
+      , 7177 , 7517 , 7853 , 8887 , 9587 , 10243 , 10937 , 11617 , 12289
+      , 12967 , 13649 , 14341 , 15013 , 15727
+      , 17749 , 19121 , 20479 , 21859 , 23209 , 24593 , 25939 , 27329
+      , 28669 , 30047 , 31469 , 35507 , 38231 , 40961 , 43711 , 46439
+      , 49157 , 51893 , 54617 , 57347 , 60077 , 62801 , 70583 , 75619
+      , 80669 , 85703 , 90749 , 95783 , 100823 , 105871 , 110909 , 115963
+      , 120997 , 126031 , 141157 , 151237 , 161323 , 171401 , 181499 , 191579
+      , 201653 , 211741 , 221813 , 231893 , 241979 , 252079
+      , 282311 , 302483 , 322649 , 342803 , 362969 , 383143 , 403301 , 423457
+      , 443629 , 463787 , 483953 , 504121 , 564617 , 604949 , 645313 , 685609
+      , 725939 , 766273 , 806609 , 846931 , 887261 , 927587 , 967919 , 1008239
+      , 1123477 , 1198397 , 1273289 , 1348177 , 1423067 , 1497983 , 1572869
+      , 1647761 , 1722667 , 1797581 , 1872461 , 1947359 , 2022253
+      , 2246953 , 2396759 , 2546543 , 2696363 , 2846161 , 2995973 , 3145739
+      , 3295541 , 3445357 , 3595117 , 3744941 , 3894707 , 4044503
+      , 4493921 , 4793501 , 5093089 , 5392679 , 5692279 , 5991883 , 6291469
+      , 6591059 , 6890641 , 7190243 , 7489829 , 7789447 , 8089033
+      , 8987807 , 9586981 , 10186177 , 10785371 , 11384539 , 11983729
+      , 12582917 , 13182109 , 13781291 , 14380469 , 14979667 , 15578861
+      , 16178053 , 17895707 , 19014187 , 20132683 , 21251141 , 22369661
+      , 23488103 , 24606583 , 25725083 , 26843549 , 27962027 , 29080529
+      , 30198989 , 31317469 , 32435981 , 35791397 , 38028379 , 40265327
+      , 42502283 , 44739259 , 46976221 , 49213237 , 51450131 , 53687099
+      , 55924061 , 58161041 , 60397993 , 62634959 , 64871921
+      , 71582857 , 76056727 , 80530643 , 85004567 , 89478503 , 93952427
+      , 98426347 , 102900263 , 107374217 , 111848111 , 116322053 , 120795971
+      , 125269877 , 129743807 , 143165587 , 152113427 , 161061283 , 170009141
+      , 178956983 , 187904819 , 196852693 , 205800547 , 214748383 , 223696237
+      , 232644089 , 241591943 , 250539763 , 259487603 , 268435399
+  };
+
+  const uint32_t num_primes = sizeof(primes)/sizeof(uint32_t);
+
+  uint32_t hsize = primes[num_primes-1] ;
+  for (uint32_t i = 0; i < num_primes; ++i) {
+    if (size <= primes[i]) {
+      hsize = primes[i];
+      break;
+    }
+  }
+  return hsize;
+}
+
+}} // namespace Kokkos::Impl
+
diff --git a/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..b788c966e9c5a04d0ce4ca626190d241ec273008
--- /dev/null
+++ b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp
@@ -0,0 +1,297 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_UNORDERED_MAP_IMPL_HPP
+#define KOKKOS_UNORDERED_MAP_IMPL_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <stdint.h>
+
+#include <cstdio>
+#include <climits>
+#include <iostream>
+#include <iomanip>
+
+namespace Kokkos { namespace Impl {
+
+uint32_t find_hash_size( uint32_t size );
+
+template <typename Map>
+struct UnorderedMapRehash
+{
+  typedef Map map_type;
+  typedef typename map_type::const_map_type const_map_type;
+  typedef typename map_type::execution_space execution_space;
+  typedef typename map_type::size_type size_type;
+
+  map_type       m_dst;
+  const_map_type m_src;
+
+  UnorderedMapRehash( map_type const& dst, const_map_type const& src)
+    : m_dst(dst), m_src(src)
+  {}
+
+  void apply() const
+  {
+    parallel_for(m_src.capacity(), *this);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(size_type i) const
+  {
+    if ( m_src.valid_at(i) )
+      m_dst.insert(m_src.key_at(i), m_src.value_at(i));
+  }
+
+};
+
+template <typename UMap>
+struct UnorderedMapErase
+{
+  typedef UMap map_type;
+  typedef typename map_type::execution_space execution_space;
+  typedef typename map_type::size_type size_type;
+  typedef typename map_type::key_type key_type;
+  typedef typename map_type::impl_value_type value_type;
+
+  map_type m_map;
+
+  UnorderedMapErase( map_type const& map)
+    : m_map(map)
+  {}
+
+  void apply() const
+  {
+    parallel_for(m_map.m_hash_lists.dimension_0(), *this);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i ) const
+  {
+    const size_type invalid_index = map_type::invalid_index;
+
+    size_type curr = m_map.m_hash_lists(i);
+    size_type next = invalid_index;
+
+    // remove erased head of the linked-list
+    while (curr != invalid_index && !m_map.valid_at(curr)) {
+      next = m_map.m_next_index[curr];
+      m_map.m_next_index[curr] = invalid_index;
+      m_map.m_keys[curr] = key_type();
+      if (m_map.is_set) m_map.m_values[curr] = value_type();
+      curr = next;
+      m_map.m_hash_lists(i) = next;
+    }
+
+    // if the list is non-empty and the head is valid
+    if (curr != invalid_index && m_map.valid_at(curr) ) {
+      size_type prev = curr;
+      curr = m_map.m_next_index[prev];
+
+      while (curr != invalid_index) {
+        next = m_map.m_next_index[curr];
+        if (m_map.valid_at(curr)) {
+          prev = curr;
+        }
+        else {
+          // remove curr from list
+          m_map.m_next_index[prev] = next;
+          m_map.m_next_index[curr] = invalid_index;
+          m_map.m_keys[curr] = key_type();
+          if (map_type::is_set) m_map.m_values[curr] = value_type();
+        }
+        curr = next;
+      }
+    }
+  }
+};
+
+template <typename UMap>
+struct UnorderedMapHistogram
+{
+  typedef UMap map_type;
+  typedef typename map_type::execution_space execution_space;
+  typedef typename map_type::size_type size_type;
+
+  typedef View<int[100], execution_space> histogram_view;
+  typedef typename histogram_view::HostMirror host_histogram_view;
+
+  map_type m_map;
+  histogram_view m_length;
+  histogram_view m_distance;
+  histogram_view m_block_distance;
+
+  UnorderedMapHistogram( map_type const& map)
+    : m_map(map)
+    , m_length("UnorderedMap Histogram")
+    , m_distance("UnorderedMap Histogram")
+    , m_block_distance("UnorderedMap Histogram")
+  {}
+
+  void calculate()
+  {
+    parallel_for(m_map.m_hash_lists.dimension_0(), *this);
+  }
+
+  void clear()
+  {
+    Kokkos::deep_copy(m_length, 0);
+    Kokkos::deep_copy(m_distance, 0);
+    Kokkos::deep_copy(m_block_distance, 0);
+  }
+
+  void print_length(std::ostream &out)
+  {
+    host_histogram_view host_copy = create_mirror_view(m_length);
+    Kokkos::deep_copy(host_copy, m_length);
+
+    for (int i=0, size = host_copy.dimension_0(); i<size; ++i)
+    {
+      out << host_copy[i] << " , ";
+    }
+    out << "\b\b\b   " << std::endl;
+  }
+
+  void print_distance(std::ostream &out)
+  {
+    host_histogram_view host_copy = create_mirror_view(m_distance);
+    Kokkos::deep_copy(host_copy, m_distance);
+
+    for (int i=0, size = host_copy.dimension_0(); i<size; ++i)
+    {
+      out << host_copy[i] << " , ";
+    }
+    out << "\b\b\b   " << std::endl;
+  }
+
+  void print_block_distance(std::ostream &out)
+  {
+    host_histogram_view host_copy = create_mirror_view(m_block_distance);
+    Kokkos::deep_copy(host_copy, m_block_distance);
+
+    for (int i=0, size = host_copy.dimension_0(); i<size; ++i)
+    {
+      out << host_copy[i] << " , ";
+    }
+    out << "\b\b\b   " << std::endl;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i ) const
+  {
+    const size_type invalid_index = map_type::invalid_index;
+
+    uint32_t length = 0;
+    size_type min_index = ~0u, max_index = 0;
+    for (size_type curr = m_map.m_hash_lists(i); curr != invalid_index; curr = m_map.m_next_index[curr]) {
+      ++length;
+      min_index = (curr < min_index) ? curr : min_index;
+      max_index = (max_index < curr) ? curr : max_index;
+    }
+
+    size_type distance = (0u < length) ? max_index - min_index : 0u;
+    size_type blocks = (0u < length) ? max_index/32u - min_index/32u : 0u;
+
+    // normalize data
+    length   = length   < 100u ? length   : 99u;
+    distance = distance < 100u ? distance : 99u;
+    blocks   = blocks   < 100u ? blocks   : 99u;
+
+    if (0u < length)
+    {
+      atomic_fetch_add( &m_length(length), 1);
+      atomic_fetch_add( &m_distance(distance), 1);
+      atomic_fetch_add( &m_block_distance(blocks), 1);
+    }
+  }
+};
+
+template <typename UMap>
+struct UnorderedMapPrint
+{
+  typedef UMap map_type;
+  typedef typename map_type::execution_space execution_space;
+  typedef typename map_type::size_type size_type;
+
+  map_type m_map;
+
+  UnorderedMapPrint( map_type const& map)
+    : m_map(map)
+  {}
+
+  void apply()
+  {
+    parallel_for(m_map.m_hash_lists.dimension_0(), *this);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type i ) const
+  {
+    const size_type invalid_index = map_type::invalid_index;
+
+    uint32_t list = m_map.m_hash_lists(i);
+    for (size_type curr = list, ii=0; curr != invalid_index; curr = m_map.m_next_index[curr], ++ii) {
+      printf("%d[%d]: %d->%d\n", list, ii, m_map.key_at(curr), m_map.value_at(curr));
+    }
+  }
+};
+
+template <typename DKey, typename DValue, typename SKey, typename SValue>
+struct UnorderedMapCanAssign : public false_ {};
+
+template <typename Key, typename Value>
+struct UnorderedMapCanAssign<Key,Value,Key,Value> : public true_ {};
+
+template <typename Key, typename Value>
+struct UnorderedMapCanAssign<const Key,Value,Key,Value> : public true_ {};
+
+template <typename Key, typename Value>
+struct UnorderedMapCanAssign<const Key,const Value,Key,Value> : public true_ {};
+
+template <typename Key, typename Value>
+struct UnorderedMapCanAssign<const Key,const Value,const Key,Value> : public true_ {};
+
+
+}} //Kokkos::Impl
+
+#endif // KOKKOS_UNORDERED_MAP_IMPL_HPP
diff --git a/lib/kokkos/containers/unit_tests/Makefile b/lib/kokkos/containers/unit_tests/Makefile
new file mode 100755
index 0000000000000000000000000000000000000000..176bfa906e54fe4a6212702944bc43bff36c7957
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/Makefile
@@ -0,0 +1,92 @@
+KOKKOS_PATH = ../..
+
+GTEST_PATH = ../../TPL/gtest
+
+vpath %.cpp ${KOKKOS_PATH}/containers/unit_tests
+
+default: build_all
+	echo "End Build"
+	
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	CXX = nvcc_wrapper
+	CXXFLAGS ?= -O3
+	LINK = $(CXX)
+	LDFLAGS ?= -lpthread
+else
+	CXX ?= g++
+	CXXFLAGS ?= -O3
+	LINK ?= $(CXX)
+	LDFLAGS ?= -lpthread
+endif
+
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/unit_tests
+
+TEST_TARGETS = 
+TARGETS = 
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	OBJ_CUDA = TestCuda.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosContainers_UnitTest_Cuda
+	TEST_TARGETS += test-cuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+	OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosContainers_UnitTest_Threads
+	TEST_TARGETS += test-threads
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	OBJ_OPENMP = TestOpenMP.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosContainers_UnitTest_OpenMP
+	TEST_TARGETS += test-openmp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+	OBJ_SERIAL = TestSerial.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosContainers_UnitTest_Serial
+	TEST_TARGETS += test-serial
+endif
+
+KokkosContainers_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_UnitTest_Cuda
+
+KokkosContainers_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_UnitTest_Threads
+	
+KokkosContainers_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_UnitTest_OpenMP
+
+KokkosContainers_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_UnitTest_Serial
+
+test-cuda: KokkosContainers_UnitTest_Cuda
+	./KokkosContainers_UnitTest_Cuda
+
+test-threads: KokkosContainers_UnitTest_Threads
+	./KokkosContainers_UnitTest_Threads
+
+test-openmp: KokkosContainers_UnitTest_OpenMP
+	./KokkosContainers_UnitTest_OpenMP
+
+test-serial: KokkosContainers_UnitTest_Serial
+	./KokkosContainers_UnitTest_Serial
+	
+build_all: $(TARGETS)
+
+test: $(TEST_TARGETS)
+	
+clean: kokkos-clean 
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
+gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc 
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
+
diff --git a/lib/kokkos/containers/unit_tests/TestBitset.hpp b/lib/kokkos/containers/unit_tests/TestBitset.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..76fb30edcb68aa37f7beb55352212211bcf586c3
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestBitset.hpp
@@ -0,0 +1,285 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_BITSET_HPP
+#define KOKKOS_TEST_BITSET_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+
+
+namespace Test {
+
+namespace Impl {
+
+template <typename Bitset, bool Set>
+struct TestBitset
+{
+  typedef Bitset bitset_type;
+  typedef typename bitset_type::execution_space execution_space;
+  typedef uint32_t value_type;
+
+  bitset_type m_bitset;
+
+  TestBitset( bitset_type const& bitset)
+    : m_bitset(bitset)
+  {}
+
+  unsigned testit(unsigned collisions)
+  {
+    execution_space::fence();
+
+    unsigned count = 0;
+    Kokkos::parallel_reduce( m_bitset.size()*collisions, *this, count);
+    return count;
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & v ) const { v = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst, const volatile value_type & src ) const
+  { dst += src; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(uint32_t i, value_type & v) const
+  {
+    i = i % m_bitset.size();
+    if (Set) {
+      if (m_bitset.set(i)) {
+        if (m_bitset.test(i)) ++v;
+      }
+    }
+    else {
+      if (m_bitset.reset(i)) {
+        if (!m_bitset.test(i)) ++v;
+      }
+    }
+  }
+
+};
+
+template <typename Bitset>
+struct TestBitsetTest
+{
+  typedef Bitset bitset_type;
+  typedef typename bitset_type::execution_space execution_space;
+  typedef uint32_t value_type;
+
+  bitset_type m_bitset;
+
+  TestBitsetTest( bitset_type const& bitset)
+    : m_bitset(bitset)
+  {}
+
+  unsigned testit()
+  {
+    execution_space::fence();
+
+    unsigned count = 0;
+    Kokkos::parallel_reduce( m_bitset.size(), *this, count);
+    return count;
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & v ) const { v = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst, const volatile value_type & src ) const
+  { dst += src; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(uint32_t i, value_type & v) const
+  {
+    if (m_bitset.test( i )) ++v;
+  }
+};
+
+template <typename Bitset, bool Set>
+struct TestBitsetAny
+{
+  typedef Bitset bitset_type;
+  typedef typename bitset_type::execution_space execution_space;
+  typedef uint32_t value_type;
+
+  bitset_type m_bitset;
+
+  TestBitsetAny( bitset_type const& bitset)
+    : m_bitset(bitset)
+  {}
+
+  unsigned testit()
+  {
+    execution_space::fence();
+
+    unsigned count = 0;
+    Kokkos::parallel_reduce( m_bitset.size(), *this, count);
+    return count;
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & v ) const { v = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst, const volatile value_type & src ) const
+  { dst += src; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(uint32_t i, value_type & v) const
+  {
+    bool result = false;
+    unsigned attempts = 0;
+    uint32_t hint = (i >> 4) << 4;
+    while (attempts < m_bitset.max_hint()) {
+      if (Set) {
+        Kokkos::tie(result, hint) = m_bitset.find_any_unset_near(hint, i);
+        if (result && m_bitset.set(hint)) {
+          ++v;
+          break;
+        }
+        else if (!result) {
+          ++attempts;
+        }
+      }
+      else {
+        Kokkos::tie(result, hint) = m_bitset.find_any_set_near(hint, i);
+        if (result && m_bitset.reset(hint)) {
+          ++v;
+          break;
+        }
+        else if (!result) {
+          ++attempts;
+        }
+      }
+    }
+  }
+
+};
+} // namespace Impl
+
+
+
+template <typename Device>
+void test_bitset()
+{
+  typedef Kokkos::Bitset< Device > bitset_type;
+  typedef Kokkos::ConstBitset< Device > const_bitset_type;
+
+  //unsigned test_sizes[] = { 0u, 1000u, 1u<<14, 1u<<16, 10000001 };
+  unsigned test_sizes[] = { 1000u, 1u<<14, 1u<<16, 10000001 };
+
+  for (int i=0, end = sizeof(test_sizes)/sizeof(unsigned); i<end; ++i) {
+
+    //std::cout << "Bitset " << test_sizes[i] << std::endl;
+
+    bitset_type bitset(test_sizes[i]);
+
+    //std::cout << "  Check inital count " << std::endl;
+    // nothing should be set
+    {
+      Impl::TestBitsetTest< bitset_type > f(bitset);
+      uint32_t count = f.testit();
+      EXPECT_EQ(0u, count);
+      EXPECT_EQ(count, bitset.count());
+    }
+
+    //std::cout << "  Check set() " << std::endl;
+    bitset.set();
+    // everything should be set
+    {
+      Impl::TestBitsetTest< const_bitset_type > f(bitset);
+      uint32_t count = f.testit();
+      EXPECT_EQ(bitset.size(), count);
+      EXPECT_EQ(count, bitset.count());
+    }
+
+    //std::cout << "  Check reset() " << std::endl;
+    bitset.reset();
+    EXPECT_EQ(0u, bitset.count());
+
+    //std::cout << "  Check set(i) " << std::endl;
+    // test setting bits
+    {
+      Impl::TestBitset< bitset_type, true > f(bitset);
+      uint32_t count = f.testit(10u);
+      EXPECT_EQ( bitset.size(), bitset.count());
+      EXPECT_EQ( bitset.size(), count );
+    }
+
+    //std::cout << "  Check reset(i) " << std::endl;
+    // test resetting bits
+    {
+      Impl::TestBitset< bitset_type, false > f(bitset);
+      uint32_t count = f.testit(10u);
+      EXPECT_EQ( bitset.size(), count);
+      EXPECT_EQ( 0u, bitset.count() );
+    }
+
+
+    //std::cout << "  Check find_any_set(i) " << std::endl;
+    // test setting any bits
+    {
+      Impl::TestBitsetAny< bitset_type, true > f(bitset);
+      uint32_t count = f.testit();
+      EXPECT_EQ( bitset.size(), bitset.count());
+      EXPECT_EQ( bitset.size(), count );
+    }
+
+    //std::cout << "  Check find_any_unset(i) " << std::endl;
+    // test resetting any bits
+    {
+      Impl::TestBitsetAny< bitset_type, false > f(bitset);
+      uint32_t count = f.testit();
+      EXPECT_EQ( bitset.size(), count);
+      EXPECT_EQ( 0u, bitset.count() );
+    }
+
+  }
+
+}
+
+} // namespace Test
+
+#endif //KOKKOS_TEST_BITSET_HPP
+
diff --git a/lib/kokkos/containers/unit_tests/TestComplex.hpp b/lib/kokkos/containers/unit_tests/TestComplex.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..a2769fd1175d5e76b68c5a415fcce4d0573e6656
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestComplex.hpp
@@ -0,0 +1,264 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+
+#ifndef KOKKOS_TEST_COMPLEX_HPP
+#define KOKKOS_TEST_COMPLEX_HPP
+
+#include <Kokkos_Complex.hpp>
+#include <gtest/gtest.h>
+#include <iostream>
+
+namespace Test {
+
+namespace Impl {
+  template <typename RealType>
+  void testComplexConstructors () {
+    typedef Kokkos::complex<RealType> complex_type;
+
+    complex_type z1;
+    complex_type z2 (0.0, 0.0);
+    complex_type z3 (1.0, 0.0);
+    complex_type z4 (0.0, 1.0);
+    complex_type z5 (-1.0, -2.0);
+
+    ASSERT_TRUE( z1 == z2 );
+    ASSERT_TRUE( z1 != z3 );
+    ASSERT_TRUE( z1 != z4 );
+    ASSERT_TRUE( z1 != z5 );
+
+    ASSERT_TRUE( z2 != z3 );
+    ASSERT_TRUE( z2 != z4 );
+    ASSERT_TRUE( z2 != z5 );
+
+    ASSERT_TRUE( z3 != z4 );
+    ASSERT_TRUE( z3 != z5 );
+
+    complex_type z6 (-1.0, -2.0);
+    ASSERT_TRUE( z5 == z6 );
+
+    // Make sure that complex has value semantics, in particular, that
+    // equality tests use values and not pointers, so that
+    // reassignment actually changes the value.
+    z1 = complex_type (-3.0, -4.0);
+    ASSERT_TRUE( z1.real () == -3.0 );
+    ASSERT_TRUE( z1.imag () == -4.0 );
+    ASSERT_TRUE( z1 != z2 );
+
+    complex_type z7 (1.0);
+    ASSERT_TRUE( z3 == z7 );
+    ASSERT_TRUE( z7 == 1.0 );
+    ASSERT_TRUE( z7 != -1.0 );
+
+    z7 = complex_type (5.0);
+    ASSERT_TRUE( z7.real () == 5.0 );
+    ASSERT_TRUE( z7.imag () == 0.0 );
+  }
+
+  template <typename RealType>
+  void testPlus () {
+    typedef Kokkos::complex<RealType> complex_type;
+
+    complex_type z1 (1.0, -1.0);
+    complex_type z2 (-1.0, 1.0);
+    complex_type z3 = z1 + z2;
+    ASSERT_TRUE( z3 == complex_type (0.0, 0.0) );
+  }
+
+  template <typename RealType>
+  void testMinus () {
+    typedef Kokkos::complex<RealType> complex_type;
+
+    // Test binary minus.
+    complex_type z1 (1.0, -1.0);
+    complex_type z2 (-1.0, 1.0);
+    complex_type z3 = z1 - z2;
+    ASSERT_TRUE( z3 == complex_type (2.0, -2.0) );
+
+    // Test unary minus.
+    complex_type z4 (3.0, -4.0);
+    ASSERT_TRUE( -z1 == complex_type (-3.0, 4.0) );
+  }
+
+  template <typename RealType>
+  void testTimes () {
+    typedef Kokkos::complex<RealType> complex_type;
+
+    complex_type z1 (1.0, -1.0);
+    complex_type z2 (-1.0, 1.0);
+    complex_type z3 = z1 - z2;
+    ASSERT_TRUE( z3 == complex_type (2.0, -2.0) );
+
+    // Test unary minus.
+    complex_type z4 (3.0, -4.0);
+    ASSERT_TRUE( z4 == complex_type (3.0, -4.0) );
+    ASSERT_TRUE( -z4 == complex_type (-3.0, 4.0) );
+    ASSERT_TRUE( z4 == -complex_type (-3.0, 4.0) );
+  }
+
+  template <typename RealType>
+  void testDivide () {
+    typedef Kokkos::complex<RealType> complex_type;
+
+    // Test division of a complex number by a real number.
+    complex_type z1 (1.0, -1.0);
+    complex_type z2 (1.0 / 2.0, -1.0 / 2.0);
+    ASSERT_TRUE( z1 / 2.0 == z2 );
+
+    // (-1+2i)/(1-i) == ((-1+2i)(1+i)) / ((1-i)(1+i))
+    // (-1+2i)(1+i) == -3 + i
+    complex_type z3 (-1.0, 2.0);
+    complex_type z4 (1.0, -1.0);
+    complex_type z5 (-3.0, 1.0);
+    ASSERT_TRUE(z3 * Kokkos::conj (z4) == z5 );
+
+    // Test division of a complex number by a complex number.
+    // This assumes that RealType is a floating-point type.
+    complex_type z6 (Kokkos::real (z5) / 2.0,
+                     Kokkos::imag (z5) / 2.0);
+
+    complex_type z7 = z3 / z4;
+    ASSERT_TRUE( z7 == z6 );
+  }
+
+  template <typename RealType>
+  void testOutsideKernel () {
+    testComplexConstructors<RealType> ();
+    testPlus<RealType> ();
+    testTimes<RealType> ();
+    testDivide<RealType> ();
+  }
+
+
+  template<typename RealType, typename Device>
+  void testCreateView () {
+    typedef Kokkos::complex<RealType> complex_type;
+    Kokkos::View<complex_type*, Device> x ("x", 10);
+    ASSERT_TRUE( x.dimension_0 () == 10 );
+
+    // Test that View assignment works.
+    Kokkos::View<complex_type*, Device> x_nonconst = x;
+    Kokkos::View<const complex_type*, Device> x_const = x;
+  }
+
+  template<typename RealType, typename Device>
+  class Fill {
+  public:
+    typedef typename Device::execution_space execution_space;
+
+    typedef Kokkos::View<Kokkos::complex<RealType>*, Device> view_type;
+    typedef typename view_type::size_type size_type;
+
+    KOKKOS_INLINE_FUNCTION
+    void operator () (const size_type i) const {
+      x_(i) = val_;
+    }
+
+    Fill (const view_type& x, const Kokkos::complex<RealType>& val) :
+      x_ (x), val_ (val)
+    {}
+
+  private:
+    view_type x_;
+    const Kokkos::complex<RealType> val_;
+  };
+
+  template<typename RealType, typename Device>
+  class Sum {
+  public:
+    typedef typename Device::execution_space execution_space;
+
+    typedef Kokkos::View<const Kokkos::complex<RealType>*, Device> view_type;
+    typedef typename view_type::size_type size_type;
+    typedef Kokkos::complex<RealType> value_type;    
+
+    KOKKOS_INLINE_FUNCTION
+    void operator () (const size_type i, Kokkos::complex<RealType>& sum) const {
+      sum += x_(i);
+    }
+
+    Sum (const view_type& x) : x_ (x) {}
+
+  private:
+    view_type x_;
+  };
+
+  template<typename RealType, typename Device>
+  void testInsideKernel () {
+    typedef Kokkos::complex<RealType> complex_type;
+    typedef Kokkos::View<complex_type*, Device> view_type;
+    typedef typename view_type::size_type size_type;
+
+    const size_type N = 1000;
+    view_type x ("x", N);
+    ASSERT_TRUE( x.dimension_0 () == N );
+
+    // Kokkos::parallel_reduce (N, [=] (const size_type i, complex_type& result) {
+    //     result += x[i];
+    //   });
+
+    Kokkos::parallel_for (N, Fill<RealType, Device> (x, complex_type (1.0, -1.0)));
+
+    complex_type sum;
+    Kokkos::parallel_reduce (N, Sum<RealType, Device> (x), sum);
+
+    ASSERT_TRUE( sum.real () == 1000.0 && sum.imag () == -1000.0 );
+  }
+} // namespace Impl
+
+
+template <typename Device>
+void testComplex ()
+{
+  Impl::testOutsideKernel<float> ();
+  Impl::testOutsideKernel<double> ();
+
+  Impl::testCreateView<float, Device> ();
+  Impl::testCreateView<double, Device> ();
+
+  Impl::testInsideKernel<float, Device> ();
+  Impl::testInsideKernel<double, Device> ();
+}
+
+
+} // namespace Test
+
+#endif // KOKKOS_TEST_COMPLEX_HPP
diff --git a/lib/kokkos/containers/unit_tests/TestCuda.cpp b/lib/kokkos/containers/unit_tests/TestCuda.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..2f79205c491f22ec067b44a24f8bfc5323504e9e
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestCuda.cpp
@@ -0,0 +1,206 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <iostream>
+#include <iomanip>
+#include <stdint.h>
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_Bitset.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+#include <Kokkos_Vector.hpp>
+
+#include <TestBitset.hpp>
+#include <TestUnorderedMap.hpp>
+#include <TestStaticCrsGraph.hpp>
+#include <TestVector.hpp>
+#include <TestDualView.hpp>
+#include <TestSegmentedView.hpp>
+
+//----------------------------------------------------------------------------
+
+
+#ifdef KOKKOS_HAVE_CUDA
+
+namespace Test {
+
+class cuda : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+    Kokkos::HostSpace::execution_space::initialize();
+    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
+  }
+  static void TearDownTestCase()
+  {
+    Kokkos::Cuda::finalize();
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+};
+
+TEST_F( cuda , staticcrsgraph )
+{
+  TestStaticCrsGraph::run_test_graph< Kokkos::Cuda >();
+  TestStaticCrsGraph::run_test_graph2< Kokkos::Cuda >();
+}
+
+
+void cuda_test_insert_close(  uint32_t num_nodes
+                            , uint32_t num_inserts
+                            , uint32_t num_duplicates
+                           )
+{
+  test_insert< Kokkos::Cuda >( num_nodes, num_inserts, num_duplicates, true);
+}
+
+void cuda_test_insert_far(  uint32_t num_nodes
+                          , uint32_t num_inserts
+                          , uint32_t num_duplicates
+                         )
+{
+  test_insert< Kokkos::Cuda >( num_nodes, num_inserts, num_duplicates, false);
+}
+
+void cuda_test_failed_insert(  uint32_t num_nodes )
+{
+  test_failed_insert< Kokkos::Cuda >( num_nodes );
+}
+
+void cuda_test_deep_copy(  uint32_t num_nodes )
+{
+  test_deep_copy< Kokkos::Cuda >( num_nodes );
+}
+
+void cuda_test_vector_combinations(unsigned int size)
+{
+  test_vector_combinations<int,Kokkos::Cuda>(size);
+}
+
+void cuda_test_dualview_combinations(unsigned int size)
+{
+  test_dualview_combinations<int,Kokkos::Cuda>(size);
+}
+
+void cuda_test_segmented_view(unsigned int size)
+{
+  test_segmented_view<double,Kokkos::Cuda>(size);
+}
+
+void cuda_test_bitset()
+{
+  test_bitset<Kokkos::Cuda>();
+}
+
+
+
+/*TEST_F( cuda, bitset )
+{
+  cuda_test_bitset();
+}*/
+
+#define CUDA_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat )                                \
+  TEST_F( cuda, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) {   \
+    for (int i=0; i<repeat; ++i)                                                                                \
+      cuda_test_insert_##name(num_nodes,num_inserts,num_duplicates);                                            \
+  }
+
+#define CUDA_FAILED_INSERT_TEST( num_nodes, repeat )                           \
+  TEST_F( cuda, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      cuda_test_failed_insert(num_nodes);                                      \
+  }
+
+#define CUDA_ASSIGNEMENT_TEST( num_nodes, repeat )                               \
+  TEST_F( cuda, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) {  \
+    for (int i=0; i<repeat; ++i)                                                 \
+      cuda_test_assignment_operators(num_nodes);                                 \
+  }
+
+#define CUDA_DEEP_COPY( num_nodes, repeat )                             \
+  TEST_F( cuda, UnorderedMap_deep_copy##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      cuda_test_deep_copy(num_nodes);                     \
+  }
+
+#define CUDA_VECTOR_COMBINE_TEST( size )                             \
+  TEST_F( cuda, vector_combination##size##x) {       \
+      cuda_test_vector_combinations(size);                     \
+  }
+
+#define CUDA_DUALVIEW_COMBINE_TEST( size )                             \
+  TEST_F( cuda, dualview_combination##size##x) {       \
+      cuda_test_dualview_combinations(size);                     \
+  }
+
+#define CUDA_SEGMENTEDVIEW_TEST( size )                             \
+  TEST_F( cuda, segmentedview_##size##x) {       \
+      cuda_test_segmented_view(size);                     \
+  }
+
+CUDA_DUALVIEW_COMBINE_TEST( 10 )
+CUDA_VECTOR_COMBINE_TEST( 10 )
+CUDA_VECTOR_COMBINE_TEST( 3057 )
+
+
+CUDA_INSERT_TEST(close,               100000, 90000, 100, 500)
+CUDA_INSERT_TEST(far,                 100000, 90000, 100, 500)
+CUDA_DEEP_COPY( 10000, 1 )
+CUDA_FAILED_INSERT_TEST( 10000, 1000 )
+CUDA_SEGMENTEDVIEW_TEST( 200 )
+
+
+#undef CUDA_INSERT_TEST
+#undef CUDA_FAILED_INSERT_TEST
+#undef CUDA_ASSIGNEMENT_TEST
+#undef CUDA_DEEP_COPY
+#undef CUDA_VECTOR_COMBINE_TEST
+#undef CUDA_DUALVIEW_COMBINE_TEST
+#undef CUDA_SEGMENTEDVIEW_TEST
+}
+
+#endif  /* #ifdef KOKKOS_HAVE_CUDA */
+
diff --git a/lib/kokkos/containers/unit_tests/TestDualView.hpp b/lib/kokkos/containers/unit_tests/TestDualView.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..e72c69f7d41cf7d493becfcbb863e5f1d9f6679f
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestDualView.hpp
@@ -0,0 +1,121 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_DUALVIEW_HPP
+#define KOKKOS_TEST_DUALVIEW_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <cstdlib>
+#include <cstdio>
+#include <impl/Kokkos_Timer.hpp>
+
+namespace Test {
+
+namespace Impl {
+
+  template <typename Scalar, class Device>
+  struct test_dualview_combinations
+  {
+    typedef test_dualview_combinations<Scalar,Device> self_type;
+
+    typedef Scalar scalar_type;
+    typedef Device execution_space;
+
+    Scalar reference;
+    Scalar result;
+
+    template <typename ViewType>
+    Scalar run_me(unsigned int n,unsigned int m){
+      if(n<10) n = 10;
+      if(m<3) m = 3;
+      ViewType a("A",n,m);
+
+      Kokkos::deep_copy( a.d_view , 1 );
+
+      a.template modify<typename ViewType::execution_space>();
+      a.template sync<typename ViewType::host_mirror_space>();
+
+      a.h_view(5,1) = 3;
+      a.h_view(6,1) = 4;
+      a.h_view(7,2) = 5;
+      a.template modify<typename ViewType::host_mirror_space>();
+      ViewType b = Kokkos::subview(a,std::pair<unsigned int, unsigned int>(6,9),std::pair<unsigned int, unsigned int>(0,1));
+      a.template sync<typename ViewType::execution_space>();
+      b.template modify<typename ViewType::execution_space>();
+
+      Kokkos::deep_copy( b.d_view , 2 );
+
+      a.template sync<typename ViewType::host_mirror_space>();
+      Scalar count = 0;
+      for(unsigned int i = 0; i<a.d_view.dimension_0(); i++)
+        for(unsigned int j = 0; j<a.d_view.dimension_1(); j++)
+          count += a.h_view(i,j);
+      return count -  a.d_view.dimension_0()*a.d_view.dimension_1()-2-4-3*2;
+    }
+
+
+    test_dualview_combinations(unsigned int size)
+    {
+      result = run_me< Kokkos::DualView<Scalar**,Kokkos::LayoutLeft,Device> >(size,3);
+    }
+
+   };
+
+} // namespace Impl
+
+
+
+
+template <typename Scalar, typename Device>
+void test_dualview_combinations(unsigned int size)
+{
+  Impl::test_dualview_combinations<Scalar,Device> test(size);
+  ASSERT_EQ( test.result,0);
+
+}
+
+
+} // namespace Test
+
+#endif //KOKKOS_TEST_UNORDERED_MAP_HPP
diff --git a/lib/kokkos/containers/unit_tests/TestOpenMP.cpp b/lib/kokkos/containers/unit_tests/TestOpenMP.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..0ff9b4f66b640b5b2bffa98a050e8bb6df33aaa3
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestOpenMP.cpp
@@ -0,0 +1,162 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#include <Kokkos_Bitset.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+#include <Kokkos_Vector.hpp>
+
+//----------------------------------------------------------------------------
+#include <TestBitset.hpp>
+#include <TestUnorderedMap.hpp>
+#include <TestStaticCrsGraph.hpp>
+#include <TestVector.hpp>
+#include <TestDualView.hpp>
+#include <TestSegmentedView.hpp>
+#include <TestComplex.hpp>
+
+#include <iomanip>
+
+namespace Test {
+
+#ifdef KOKKOS_HAVE_OPENMP
+class openmp : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+
+    unsigned threads_count = 4 ;
+
+    if ( Kokkos::hwloc::available() ) {
+      threads_count = Kokkos::hwloc::get_available_numa_count() *
+                      Kokkos::hwloc::get_available_cores_per_numa();
+    }
+
+    Kokkos::OpenMP::initialize( threads_count );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::OpenMP::finalize();
+  }
+};
+
+TEST_F( openmp, complex )
+{
+  testComplex<Kokkos::OpenMP> ();
+}
+
+TEST_F( openmp, bitset )
+{
+  test_bitset<Kokkos::OpenMP>();
+}
+
+TEST_F( openmp , staticcrsgraph )
+{
+  TestStaticCrsGraph::run_test_graph< Kokkos::OpenMP >();
+  TestStaticCrsGraph::run_test_graph2< Kokkos::OpenMP >();
+}
+
+#define OPENMP_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat, near )                                \
+  TEST_F( openmp, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) {   \
+    for (int i=0; i<repeat; ++i)                                                                                \
+      test_insert<Kokkos::OpenMP>(num_nodes,num_inserts,num_duplicates, near);                                   \
+  }
+
+#define OPENMP_FAILED_INSERT_TEST( num_nodes, repeat )                         \
+  TEST_F( openmp, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) {     \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_failed_insert<Kokkos::OpenMP>(num_nodes);                             \
+  }
+
+#define OPENMP_ASSIGNEMENT_TEST( num_nodes, repeat )                             \
+  TEST_F( openmp, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_assignement_operators<Kokkos::OpenMP>(num_nodes);                     \
+  }
+
+#define OPENMP_DEEP_COPY( num_nodes, repeat )                             \
+  TEST_F( openmp, UnorderedMap_deep_copy##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_deep_copy<Kokkos::OpenMP>(num_nodes);                     \
+  }
+
+#define OPENMP_VECTOR_COMBINE_TEST( size )                             \
+  TEST_F( openmp, vector_combination##size##x) {       \
+      test_vector_combinations<int,Kokkos::OpenMP>(size);                     \
+  }
+
+#define OPENMP_DUALVIEW_COMBINE_TEST( size )                             \
+  TEST_F( openmp, dualview_combination##size##x) {       \
+      test_dualview_combinations<int,Kokkos::OpenMP>(size);                     \
+  }
+
+#define OPENMP_SEGMENTEDVIEW_TEST( size )                             \
+  TEST_F( openmp, segmentedview_##size##x) {       \
+      test_segmented_view<double,Kokkos::OpenMP>(size);                     \
+  }
+
+OPENMP_INSERT_TEST(close, 100000, 90000, 100, 500, true)
+OPENMP_INSERT_TEST(far, 100000, 90000, 100, 500, false)
+OPENMP_FAILED_INSERT_TEST( 10000, 1000 )
+OPENMP_DEEP_COPY( 10000, 1 )
+
+OPENMP_VECTOR_COMBINE_TEST( 10 )
+OPENMP_VECTOR_COMBINE_TEST( 3057 )
+OPENMP_DUALVIEW_COMBINE_TEST( 10 )
+OPENMP_SEGMENTEDVIEW_TEST( 10000 )
+
+#undef OPENMP_INSERT_TEST
+#undef OPENMP_FAILED_INSERT_TEST
+#undef OPENMP_ASSIGNEMENT_TEST
+#undef OPENMP_DEEP_COPY
+#undef OPENMP_VECTOR_COMBINE_TEST
+#undef OPENMP_DUALVIEW_COMBINE_TEST
+#undef OPENMP_SEGMENTEDVIEW_TEST
+#endif
+} // namespace test
+
diff --git a/lib/kokkos/containers/unit_tests/TestSegmentedView.hpp b/lib/kokkos/containers/unit_tests/TestSegmentedView.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..3da4bc781bd31c23bf4b9283f343670c37d820d2
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestSegmentedView.hpp
@@ -0,0 +1,708 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TEST_SEGMENTEDVIEW_HPP
+#define KOKKOS_TEST_SEGMENTEDVIEW_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <cstdlib>
+#include <cstdio>
+#include <Kokkos_Core.hpp>
+
+#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
+
+#include <Kokkos_SegmentedView.hpp>
+#include <impl/Kokkos_Timer.hpp>
+
+namespace Test {
+
+namespace Impl {
+
+  template<class ViewType , class ExecutionSpace, int Rank = ViewType::Rank>
+  struct GrowTest;
+
+  template<class ViewType , class ExecutionSpace>
+  struct GrowTest<ViewType , ExecutionSpace , 1> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+
+    GrowTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+
+      a.grow(team_member , team_idx+team_member.team_size());
+      value += team_idx + team_member.team_rank();
+
+      if((a.dimension_0()>team_idx+team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+team_member.team_rank()))
+        a(team_idx+team_member.team_rank()) = team_idx+team_member.team_rank();
+
+    }
+  };
+
+  template<class ViewType , class ExecutionSpace>
+  struct GrowTest<ViewType , ExecutionSpace , 2> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+
+    GrowTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+
+      a.grow(team_member , team_idx+ team_member.team_size());
+
+      for( typename ExecutionSpace::size_type k=0;k<7;k++)
+        value += team_idx + team_member.team_rank() + 13*k;
+
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++) {
+          a(team_idx+ team_member.team_rank(),k) =
+              team_idx+ team_member.team_rank() + 13*k;
+        }
+      }
+    }
+  };
+
+  template<class ViewType , class ExecutionSpace>
+  struct GrowTest<ViewType , ExecutionSpace , 3> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+
+    GrowTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+
+      a.grow(team_member , team_idx+ team_member.team_size());
+
+      for( typename ExecutionSpace::size_type k=0;k<7;k++)
+        for( typename ExecutionSpace::size_type l=0;l<3;l++)
+          value += team_idx + team_member.team_rank() + 13*k + 3*l;
+
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+            a(team_idx+ team_member.team_rank(),k,l) =
+                team_idx+ team_member.team_rank() + 13*k + 3*l;
+      }
+    }
+  };
+
+  template<class ViewType , class ExecutionSpace>
+  struct GrowTest<ViewType , ExecutionSpace , 4> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+
+    GrowTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+
+      a.grow(team_member , team_idx+ team_member.team_size());
+
+      for( typename ExecutionSpace::size_type k=0;k<7;k++)
+        for( typename ExecutionSpace::size_type l=0;l<3;l++)
+          for( typename ExecutionSpace::size_type m=0;m<2;m++)
+            value += team_idx + team_member.team_rank() + 13*k + 3*l + 7*m;
+
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+            for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+              a(team_idx+ team_member.team_rank(),k,l,m) =
+                  team_idx+ team_member.team_rank() + 13*k + 3*l + 7*m;
+      }
+    }
+  };
+
+  template<class ViewType , class ExecutionSpace>
+  struct GrowTest<ViewType , ExecutionSpace , 5> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+
+    GrowTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+
+      a.grow(team_member , team_idx+ team_member.team_size());
+
+      for( typename ExecutionSpace::size_type k=0;k<7;k++)
+        for( typename ExecutionSpace::size_type l=0;l<3;l++)
+          for( typename ExecutionSpace::size_type m=0;m<2;m++)
+            for( typename ExecutionSpace::size_type n=0;n<3;n++)
+              value +=
+                  team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n;
+
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+            for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+              for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
+                a(team_idx+ team_member.team_rank(),k,l,m,n) =
+                  team_idx+ team_member.team_rank() + 13*k + 3*l + 7*m + 5*n;
+      }
+    }
+  };
+
+  template<class ViewType , class ExecutionSpace>
+  struct GrowTest<ViewType , ExecutionSpace , 6> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+
+    GrowTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+
+      a.grow(team_member , team_idx+ team_member.team_size());
+
+      for( typename ExecutionSpace::size_type k=0;k<7;k++)
+        for( typename ExecutionSpace::size_type l=0;l<3;l++)
+          for( typename ExecutionSpace::size_type m=0;m<2;m++)
+            for( typename ExecutionSpace::size_type n=0;n<3;n++)
+              for( typename ExecutionSpace::size_type o=0;o<2;o++)
+              value +=
+                  team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o ;
+
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+            for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+              for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
+                for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
+                a(team_idx+ team_member.team_rank(),k,l,m,n,o) =
+                    team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o ;
+      }
+    }
+  };
+
+  template<class ViewType , class ExecutionSpace>
+  struct GrowTest<ViewType , ExecutionSpace , 7> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+
+    GrowTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+
+      a.grow(team_member , team_idx+ team_member.team_size());
+
+      for( typename ExecutionSpace::size_type k=0;k<7;k++)
+        for( typename ExecutionSpace::size_type l=0;l<3;l++)
+          for( typename ExecutionSpace::size_type m=0;m<2;m++)
+            for( typename ExecutionSpace::size_type n=0;n<3;n++)
+              for( typename ExecutionSpace::size_type o=0;o<2;o++)
+                for( typename ExecutionSpace::size_type p=0;p<4;p++)
+              value +=
+                  team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p ;
+
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+            for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+              for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
+                for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
+                  for( typename ExecutionSpace::size_type p=0;p<a.dimension_6();p++)
+                a(team_idx+ team_member.team_rank(),k,l,m,n,o,p) =
+                    team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p ;
+      }
+    }
+  };
+
+  template<class ViewType , class ExecutionSpace>
+  struct GrowTest<ViewType , ExecutionSpace , 8> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+
+    GrowTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+      a.grow(team_member , team_idx + team_member.team_size());
+
+      for( typename ExecutionSpace::size_type k=0;k<7;k++)
+        for( typename ExecutionSpace::size_type l=0;l<3;l++)
+          for( typename ExecutionSpace::size_type m=0;m<2;m++)
+            for( typename ExecutionSpace::size_type n=0;n<3;n++)
+              for( typename ExecutionSpace::size_type o=0;o<2;o++)
+                for( typename ExecutionSpace::size_type p=0;p<4;p++)
+                  for( typename ExecutionSpace::size_type q=0;q<3;q++)
+              value +=
+                  team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p + 17*q;
+
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+            for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+              for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
+                for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
+                  for( typename ExecutionSpace::size_type p=0;p<a.dimension_6();p++)
+                    for( typename ExecutionSpace::size_type q=0;q<a.dimension_7();q++)
+                a(team_idx+ team_member.team_rank(),k,l,m,n,o,p,q) =
+                    team_idx + team_member.team_rank() + 13*k + 3*l + 7*m + 5*n + 2*o + 15*p + 17*q;
+      }
+    }
+  };
+
+  template<class ViewType , class ExecutionSpace, int Rank = ViewType::Rank>
+  struct VerifyTest;
+
+  template<class ViewType , class ExecutionSpace>
+  struct VerifyTest<ViewType , ExecutionSpace , 1> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+
+    VerifyTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        value += a(team_idx+ team_member.team_rank());
+      }
+    }
+  };
+
+  template<class ViewType , class ExecutionSpace>
+  struct VerifyTest<ViewType , ExecutionSpace , 2> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+
+    VerifyTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          value += a(team_idx+ team_member.team_rank(),k);
+      }
+    }
+  };
+
+  template<class ViewType , class ExecutionSpace>
+  struct VerifyTest<ViewType , ExecutionSpace , 3> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+
+    VerifyTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+            value += a(team_idx+ team_member.team_rank(),k,l);
+      }
+    }
+  };
+
+  template<class ViewType , class ExecutionSpace>
+  struct VerifyTest<ViewType , ExecutionSpace , 4> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+
+    VerifyTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+            for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+              value += a(team_idx+ team_member.team_rank(),k,l,m);
+      }
+    }
+  };
+
+  template<class ViewType , class ExecutionSpace>
+  struct VerifyTest<ViewType , ExecutionSpace , 5> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+
+    VerifyTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+            for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+              for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
+                value += a(team_idx+ team_member.team_rank(),k,l,m,n);
+      }
+    }
+  };
+
+  template<class ViewType , class ExecutionSpace>
+  struct VerifyTest<ViewType , ExecutionSpace , 6> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+
+    VerifyTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+            for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+              for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
+                for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
+                  value += a(team_idx+ team_member.team_rank(),k,l,m,n,o);
+      }
+    }
+  };
+
+  template<class ViewType , class ExecutionSpace>
+  struct VerifyTest<ViewType , ExecutionSpace , 7> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+
+    VerifyTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+            for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+              for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
+                for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
+                  for( typename ExecutionSpace::size_type p=0;p<a.dimension_6();p++)
+                    value += a(team_idx+ team_member.team_rank(),k,l,m,n,o,p);
+      }
+    }
+  };
+
+  template<class ViewType , class ExecutionSpace>
+  struct VerifyTest<ViewType , ExecutionSpace , 8> {
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+    typedef typename Policy::member_type team_type;
+    typedef double value_type;
+
+    ViewType a;
+
+    VerifyTest(ViewType in):a(in) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (team_type team_member, double& value) const {
+      unsigned int team_idx = team_member.league_rank() * team_member.team_size();
+
+      if((a.dimension_0()>team_idx+ team_member.team_rank()) &&
+         (a.dimension(0)>team_idx+ team_member.team_rank())) {
+        for( typename ExecutionSpace::size_type k=0;k<a.dimension_1();k++)
+          for( typename ExecutionSpace::size_type l=0;l<a.dimension_2();l++)
+            for( typename ExecutionSpace::size_type m=0;m<a.dimension_3();m++)
+              for( typename ExecutionSpace::size_type n=0;n<a.dimension_4();n++)
+                for( typename ExecutionSpace::size_type o=0;o<a.dimension_5();o++)
+                  for( typename ExecutionSpace::size_type p=0;p<a.dimension_6();p++)
+                    for( typename ExecutionSpace::size_type q=0;q<a.dimension_7();q++)
+                      value += a(team_idx+ team_member.team_rank(),k,l,m,n,o,p,q);
+      }
+    }
+  };
+
+  template <typename Scalar, class ExecutionSpace>
+  struct test_segmented_view
+  {
+    typedef test_segmented_view<Scalar,ExecutionSpace> self_type;
+
+    typedef Scalar scalar_type;
+    typedef ExecutionSpace execution_space;
+    typedef Kokkos::TeamPolicy<execution_space> Policy;
+
+    double result;
+    double reference;
+
+    template <class ViewType>
+    void run_me(ViewType a, int max_length){
+      const int team_size = Policy::team_size_max( GrowTest<ViewType,execution_space>(a) );
+      const int nteams = max_length/team_size;
+
+      reference = 0;
+      result = 0;
+
+      Kokkos::parallel_reduce(Policy(nteams,team_size),GrowTest<ViewType,execution_space>(a),reference);
+      Kokkos::fence();
+      Kokkos::parallel_reduce(Policy(nteams,team_size),VerifyTest<ViewType,execution_space>(a),result);
+      Kokkos::fence();
+    }
+
+
+    test_segmented_view(unsigned int size,int rank)
+    {
+      reference = 0;
+      result = 0;
+
+      const int dim_1 = 7;
+      const int dim_2 = 3;
+      const int dim_3 = 2;
+      const int dim_4 = 3;
+      const int dim_5 = 2;
+      const int dim_6 = 4;
+      //const int dim_7 = 3;
+
+      if(rank==1) {
+        typedef Kokkos::Experimental::SegmentedView<Scalar*,Kokkos::LayoutLeft,ExecutionSpace> rank1_view;
+        run_me< rank1_view >(rank1_view("Rank1",128,size), size);
+      }
+      if(rank==2) {
+        typedef Kokkos::Experimental::SegmentedView<Scalar**,Kokkos::LayoutLeft,ExecutionSpace> rank2_view;
+        run_me< rank2_view >(rank2_view("Rank2",128,size,dim_1), size);
+      }
+      if(rank==3) {
+        typedef Kokkos::Experimental::SegmentedView<Scalar*[7][3][2],Kokkos::LayoutRight,ExecutionSpace> rank3_view;
+        run_me< rank3_view >(rank3_view("Rank3",128,size), size);
+      }
+      if(rank==4) {
+        typedef Kokkos::Experimental::SegmentedView<Scalar****,Kokkos::LayoutRight,ExecutionSpace> rank4_view;
+        run_me< rank4_view >(rank4_view("Rank4",128,size,dim_1,dim_2,dim_3), size);
+      }
+      if(rank==5) {
+        typedef Kokkos::Experimental::SegmentedView<Scalar*[7][3][2][3],Kokkos::LayoutLeft,ExecutionSpace> rank5_view;
+        run_me< rank5_view >(rank5_view("Rank5",128,size), size);
+      }
+      if(rank==6) {
+        typedef Kokkos::Experimental::SegmentedView<Scalar*****[2],Kokkos::LayoutRight,ExecutionSpace> rank6_view;
+        run_me< rank6_view >(rank6_view("Rank6",128,size,dim_1,dim_2,dim_3,dim_4), size);
+      }
+      if(rank==7) {
+        typedef Kokkos::Experimental::SegmentedView<Scalar*******,Kokkos::LayoutLeft,ExecutionSpace> rank7_view;
+        run_me< rank7_view >(rank7_view("Rank7",128,size,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6), size);
+      }
+      if(rank==8) {
+        typedef Kokkos::Experimental::SegmentedView<Scalar*****[2][4][3],Kokkos::LayoutLeft,ExecutionSpace> rank8_view;
+        run_me< rank8_view >(rank8_view("Rank8",128,size,dim_1,dim_2,dim_3,dim_4), size);
+      }
+    }
+
+   };
+
+} // namespace Impl
+
+
+
+
+template <typename Scalar, class ExecutionSpace>
+void test_segmented_view(unsigned int size)
+{
+  {
+    typedef Kokkos::Experimental::SegmentedView<Scalar*****[2][4][3],Kokkos::LayoutLeft,ExecutionSpace> view_type;
+    view_type a("A",128,size,7,3,2,3);
+    double reference;
+
+    Impl::GrowTest<view_type,ExecutionSpace> f(a);
+
+    const int team_size = Kokkos::TeamPolicy<ExecutionSpace>::team_size_max( f );
+    const int nteams = (size+team_size-1)/team_size;
+
+    Kokkos::parallel_reduce(Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size),f,reference);
+
+    size_t real_size = ((size+127)/128)*128;
+
+    ASSERT_EQ(real_size,a.dimension_0());
+    ASSERT_EQ(7,a.dimension_1());
+    ASSERT_EQ(3,a.dimension_2());
+    ASSERT_EQ(2,a.dimension_3());
+    ASSERT_EQ(3,a.dimension_4());
+    ASSERT_EQ(2,a.dimension_5());
+    ASSERT_EQ(4,a.dimension_6());
+    ASSERT_EQ(3,a.dimension_7());
+    ASSERT_EQ(real_size,a.dimension(0));
+    ASSERT_EQ(7,a.dimension(1));
+    ASSERT_EQ(3,a.dimension(2));
+    ASSERT_EQ(2,a.dimension(3));
+    ASSERT_EQ(3,a.dimension(4));
+    ASSERT_EQ(2,a.dimension(5));
+    ASSERT_EQ(4,a.dimension(6));
+    ASSERT_EQ(3,a.dimension(7));
+    ASSERT_EQ(8,a.Rank);
+  }
+  {
+    Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,1);
+    ASSERT_EQ(test.reference,test.result);
+  }
+  {
+    Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,2);
+    ASSERT_EQ(test.reference,test.result);
+  }
+  {
+    Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,3);
+    ASSERT_EQ(test.reference,test.result);
+  }
+  {
+    Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,4);
+    ASSERT_EQ(test.reference,test.result);
+  }
+  {
+    Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,5);
+    ASSERT_EQ(test.reference,test.result);
+  }
+  {
+    Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,6);
+    ASSERT_EQ(test.reference,test.result);
+  }
+  {
+    Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,7);
+    ASSERT_EQ(test.reference,test.result);
+  }
+  {
+    Impl::test_segmented_view<Scalar,ExecutionSpace> test(size,8);
+    ASSERT_EQ(test.reference,test.result);
+  }
+
+}
+
+
+} // namespace Test
+
+#else
+
+template <typename Scalar, class ExecutionSpace>
+void test_segmented_view(unsigned int ) {}
+
+#endif
+
+#endif /* #ifndef KOKKOS_TEST_SEGMENTEDVIEW_HPP */
+
diff --git a/lib/kokkos/containers/unit_tests/TestSerial.cpp b/lib/kokkos/containers/unit_tests/TestSerial.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..6f00b113f96210299ddbd378d8bcaabed43d842d
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestSerial.cpp
@@ -0,0 +1,158 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if ! defined(KOKKOS_HAVE_SERIAL)
+#  error "It doesn't make sense to build this file unless the Kokkos::Serial device is enabled.  If you see this message, it probably means that there is an error in Kokkos' CMake build infrastructure."
+#else
+
+#include <Kokkos_Bitset.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+#include <Kokkos_Vector.hpp>
+
+#include <TestBitset.hpp>
+#include <TestUnorderedMap.hpp>
+#include <TestStaticCrsGraph.hpp>
+#include <TestVector.hpp>
+#include <TestDualView.hpp>
+#include <TestSegmentedView.hpp>
+#include <TestComplex.hpp>
+
+#include <iomanip>
+
+namespace Test {
+
+class serial : public ::testing::Test {
+protected:
+  static void SetUpTestCase () {
+    std::cout << std::setprecision(5) << std::scientific;
+    Kokkos::Serial::initialize ();
+  }
+
+  static void TearDownTestCase () {
+    Kokkos::Serial::finalize ();
+  }
+};
+
+
+TEST_F( serial , staticcrsgraph )
+{
+  TestStaticCrsGraph::run_test_graph< Kokkos::Serial >();
+  TestStaticCrsGraph::run_test_graph2< Kokkos::Serial >();
+}
+
+TEST_F( serial, complex )
+{
+  testComplex<Kokkos::Serial> ();
+}
+
+TEST_F( serial, bitset )
+{
+  test_bitset<Kokkos::Serial> ();
+}
+
+#define SERIAL_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat, near ) \
+  TEST_F( serial, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) { \
+    for (int i=0; i<repeat; ++i)                                        \
+      test_insert<Kokkos::Serial> (num_nodes, num_inserts, num_duplicates, near); \
+  }
+
+#define SERIAL_FAILED_INSERT_TEST( num_nodes, repeat )                  \
+  TEST_F( serial, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) { \
+    for (int i=0; i<repeat; ++i)                                        \
+      test_failed_insert<Kokkos::Serial> (num_nodes);                   \
+  }
+
+#define SERIAL_ASSIGNEMENT_TEST( num_nodes, repeat )                    \
+  TEST_F( serial, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) { \
+    for (int i=0; i<repeat; ++i)                                        \
+      test_assignement_operators<Kokkos::Serial> (num_nodes);           \
+  }
+
+#define SERIAL_DEEP_COPY( num_nodes, repeat )                           \
+  TEST_F( serial, UnorderedMap_deep_copy##num_nodes##_##repeat##x) {    \
+    for (int i=0; i<repeat; ++i)                                        \
+      test_deep_copy<Kokkos::Serial> (num_nodes);                       \
+  }
+
+#define SERIAL_VECTOR_COMBINE_TEST( size )             \
+  TEST_F( serial, vector_combination##size##x) {                        \
+    test_vector_combinations<int,Kokkos::Serial>(size);                 \
+  }
+
+#define SERIAL_DUALVIEW_COMBINE_TEST( size )             \
+  TEST_F( serial, dualview_combination##size##x) {                      \
+    test_dualview_combinations<int,Kokkos::Serial>(size);               \
+  }
+
+#define SERIAL_SEGMENTEDVIEW_TEST( size )                               \
+  TEST_F( serial, segmentedview_##size##x) {                            \
+    test_segmented_view<double,Kokkos::Serial>(size);                   \
+  }
+
+SERIAL_INSERT_TEST(close, 100000, 90000, 100, 500, true)
+SERIAL_INSERT_TEST(far, 100000, 90000, 100, 500, false)
+SERIAL_FAILED_INSERT_TEST( 10000, 1000 )
+SERIAL_DEEP_COPY( 10000, 1 )
+
+SERIAL_VECTOR_COMBINE_TEST( 10 )
+SERIAL_VECTOR_COMBINE_TEST( 3057 )
+SERIAL_DUALVIEW_COMBINE_TEST( 10 )
+SERIAL_SEGMENTEDVIEW_TEST( 10000 )
+
+#undef SERIAL_INSERT_TEST
+#undef SERIAL_FAILED_INSERT_TEST
+#undef SERIAL_ASSIGNEMENT_TEST
+#undef SERIAL_DEEP_COPY
+#undef SERIAL_VECTOR_COMBINE_TEST
+#undef SERIAL_DUALVIEW_COMBINE_TEST
+#undef SERIAL_SEGMENTEDVIEW_TEST
+
+} // namespace test
+
+#endif // KOKKOS_HAVE_SERIAL
+
+
diff --git a/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..52b45b786562efcfbaf10a4db3ac280eb644b09b
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp
@@ -0,0 +1,149 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <vector>
+
+#include <Kokkos_StaticCrsGraph.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace TestStaticCrsGraph {
+
+template< class Space >
+void run_test_graph()
+{
+  typedef Kokkos::StaticCrsGraph< unsigned , Space > dView ;
+  typedef typename dView::HostMirror hView ;
+
+  const unsigned LENGTH = 1000 ;
+  dView dx ;
+  hView hx ;
+
+  std::vector< std::vector< int > > graph( LENGTH );
+
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    graph[i].reserve(8);
+    for ( size_t j = 0 ; j < 8 ; ++j ) {
+      graph[i].push_back( i + j * 3 );
+    }
+  }
+
+  dx = Kokkos::create_staticcrsgraph<dView>( "dx" , graph );
+    hx = Kokkos::create_mirror( dx );
+
+  ASSERT_EQ( hx.row_map.dimension_0() - 1 , LENGTH );
+
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    const size_t begin = hx.row_map[i];
+    const size_t n = hx.row_map[i+1] - begin ;
+    ASSERT_EQ( n , graph[i].size() );
+    for ( size_t j = 0 ; j < n ; ++j ) {
+      ASSERT_EQ( (int) hx.entries( j + begin ) , graph[i][j] );
+    }
+  }
+}
+
+template< class Space >
+void run_test_graph2()
+{
+  typedef Kokkos::StaticCrsGraph< unsigned[3] , Space > dView ;
+  typedef typename dView::HostMirror hView ;
+
+  const unsigned LENGTH = 10 ;
+
+  std::vector< size_t > sizes( LENGTH );
+
+  size_t total_length = 0 ;
+
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    total_length += ( sizes[i] = 6 + i % 4 );
+  }
+
+  dView dx = Kokkos::create_staticcrsgraph<dView>( "test" , sizes );
+  hView hx = Kokkos::create_mirror( dx );
+  hView mx = Kokkos::create_mirror( dx );
+
+  ASSERT_EQ( (size_t) dx.row_map.dimension_0() , (size_t) LENGTH + 1 );
+  ASSERT_EQ( (size_t) hx.row_map.dimension_0() , (size_t) LENGTH + 1 );
+  ASSERT_EQ( (size_t) mx.row_map.dimension_0() , (size_t) LENGTH + 1 );
+
+  ASSERT_EQ( (size_t) dx.entries.dimension_0() , (size_t) total_length );
+  ASSERT_EQ( (size_t) hx.entries.dimension_0() , (size_t) total_length );
+  ASSERT_EQ( (size_t) mx.entries.dimension_0() , (size_t) total_length );
+
+  ASSERT_EQ( (size_t) dx.entries.dimension_1() , (size_t) 3 );
+  ASSERT_EQ( (size_t) hx.entries.dimension_1() , (size_t) 3 );
+  ASSERT_EQ( (size_t) mx.entries.dimension_1() , (size_t) 3 );
+
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    const size_t entry_begin = hx.row_map[i];
+    const size_t entry_end   = hx.row_map[i+1];
+    for ( size_t j = entry_begin ; j < entry_end ; ++j ) {
+      hx.entries(j,0) = j + 1 ;
+      hx.entries(j,1) = j + 2 ;
+      hx.entries(j,2) = j + 3 ;
+    }
+  }
+
+  Kokkos::deep_copy( dx.entries , hx.entries );
+  Kokkos::deep_copy( mx.entries , dx.entries );
+
+  ASSERT_EQ( mx.row_map.dimension_0() , (size_t) LENGTH + 1 );
+
+  for ( size_t i = 0 ; i < LENGTH ; ++i ) {
+    const size_t entry_begin = mx.row_map[i];
+    const size_t entry_end   = mx.row_map[i+1];
+    ASSERT_EQ( ( entry_end - entry_begin ) , sizes[i] );
+    for ( size_t j = entry_begin ; j < entry_end ; ++j ) {
+      ASSERT_EQ( (size_t) mx.entries( j , 0 ) , ( j + 1 ) );
+      ASSERT_EQ( (size_t) mx.entries( j , 1 ) , ( j + 2 ) );
+      ASSERT_EQ( (size_t) mx.entries( j , 2 ) , ( j + 3 ) );
+    }
+  }
+}
+
+} /* namespace TestStaticCrsGraph */
+
+
diff --git a/lib/kokkos/containers/unit_tests/TestThreads.cpp b/lib/kokkos/containers/unit_tests/TestThreads.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..9320a114fb858e94c8b7f60c60c322857147530f
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestThreads.cpp
@@ -0,0 +1,168 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+
+#include <Kokkos_Bitset.hpp>
+#include <Kokkos_UnorderedMap.hpp>
+
+#include <Kokkos_Vector.hpp>
+#include <iomanip>
+
+
+//----------------------------------------------------------------------------
+#include <TestBitset.hpp>
+#include <TestUnorderedMap.hpp>
+#include <TestStaticCrsGraph.hpp>
+
+#include <TestVector.hpp>
+#include <TestDualView.hpp>
+#include <TestSegmentedView.hpp>
+
+namespace Test {
+
+class threads : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    std::cout << std::setprecision(5) << std::scientific;
+
+    unsigned num_threads = 4;
+
+    if (Kokkos::hwloc::available()) {
+      num_threads = Kokkos::hwloc::get_available_numa_count()
+                    * Kokkos::hwloc::get_available_cores_per_numa()
+                 // * Kokkos::hwloc::get_available_threads_per_core()
+                    ;
+
+    }
+
+    std::cout << "Threads: " << num_threads << std::endl;
+
+    Kokkos::Threads::initialize( num_threads );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::Threads::finalize();
+  }
+};
+
+TEST_F( threads , staticcrsgraph )
+{
+  TestStaticCrsGraph::run_test_graph< Kokkos::Threads >();
+  TestStaticCrsGraph::run_test_graph2< Kokkos::Threads >();
+}
+
+/*TEST_F( threads, bitset )
+{
+  test_bitset<Kokkos::Threads>();
+}*/
+
+#define THREADS_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat, near )                                \
+  TEST_F( threads, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) {   \
+    for (int i=0; i<repeat; ++i)                                                                                \
+      test_insert<Kokkos::Threads>(num_nodes,num_inserts,num_duplicates, near);                                   \
+  }
+
+#define THREADS_FAILED_INSERT_TEST( num_nodes, repeat )                            \
+  TEST_F( threads, UnorderedMap_failed_insert_##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_failed_insert<Kokkos::Threads>(num_nodes);                             \
+  }
+
+#define THREADS_ASSIGNEMENT_TEST( num_nodes, repeat )                             \
+  TEST_F( threads, UnorderedMap_assignment_operators_##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_assignement_operators<Kokkos::Threads>(num_nodes);                     \
+  }
+
+#define THREADS_DEEP_COPY( num_nodes, repeat )                             \
+  TEST_F( threads, UnorderedMap_deep_copy##num_nodes##_##repeat##x) {       \
+    for (int i=0; i<repeat; ++i)                                               \
+      test_deep_copy<Kokkos::Threads>(num_nodes);                     \
+  }
+
+#define THREADS_VECTOR_COMBINE_TEST( size )                             \
+  TEST_F( threads, vector_combination##size##x) {       \
+      test_vector_combinations<int,Kokkos::Threads>(size);                     \
+  }
+
+#define THREADS_DUALVIEW_COMBINE_TEST( size )                             \
+  TEST_F( threads, dualview_combination##size##x) {       \
+      test_dualview_combinations<int,Kokkos::Threads>(size);                     \
+  }
+
+#define THREADS_SEGMENTEDVIEW_TEST( size )                             \
+  TEST_F( threads, segmentedview_##size##x) {       \
+      test_segmented_view<double,Kokkos::Threads>(size);                     \
+  }
+
+
+THREADS_INSERT_TEST(far, 100000, 90000, 100, 500, false)
+THREADS_FAILED_INSERT_TEST( 10000, 1000 )
+THREADS_DEEP_COPY( 10000, 1 )
+
+THREADS_VECTOR_COMBINE_TEST( 10 )
+THREADS_VECTOR_COMBINE_TEST( 3057 )
+THREADS_DUALVIEW_COMBINE_TEST( 10 )
+THREADS_SEGMENTEDVIEW_TEST( 10000 )
+
+
+#undef THREADS_INSERT_TEST
+#undef THREADS_FAILED_INSERT_TEST
+#undef THREADS_ASSIGNEMENT_TEST
+#undef THREADS_DEEP_COPY
+#undef THREADS_VECTOR_COMBINE_TEST
+#undef THREADS_DUALVIEW_COMBINE_TEST
+#undef THREADS_SEGMENTEDVIEW_TEST
+
+} // namespace Test
+
+
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
+
diff --git a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..ff0328548dee0a3458faa82ab44a16e5a081d29b
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp
@@ -0,0 +1,313 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_UNORDERED_MAP_HPP
+#define KOKKOS_TEST_UNORDERED_MAP_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+
+
+namespace Test {
+
+namespace Impl {
+
+template <typename MapType, bool Near = false>
+struct TestInsert
+{
+  typedef MapType map_type;
+  typedef typename map_type::execution_space execution_space;
+  typedef uint32_t value_type;
+
+  map_type map;
+  uint32_t inserts;
+  uint32_t collisions;
+
+  TestInsert( map_type arg_map, uint32_t arg_inserts, uint32_t arg_collisions)
+    : map(arg_map)
+    , inserts(arg_inserts)
+    , collisions(arg_collisions)
+  {}
+
+  void testit( bool rehash_on_fail = true )
+  {
+    execution_space::fence();
+
+    uint32_t failed_count = 0;
+    do {
+      failed_count = 0;
+      Kokkos::parallel_reduce(inserts, *this, failed_count);
+
+      if (rehash_on_fail && failed_count > 0u) {
+        const uint32_t new_capacity = map.capacity() + ((map.capacity()*3ull)/20u) + failed_count/collisions ;
+        map.rehash( new_capacity );
+      }
+    } while (rehash_on_fail && failed_count > 0u);
+
+    execution_space::fence();
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & failed_count ) const { failed_count = 0; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & failed_count, const volatile value_type & count ) const
+  { failed_count += count; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(uint32_t i, value_type & failed_count) const
+  {
+    const uint32_t key = Near ? i/collisions : i%(inserts/collisions);
+    if (map.insert(key,i).failed()) ++failed_count;
+  }
+
+};
+
+  template <typename MapType, bool Near>
+  struct TestErase
+  {
+    typedef TestErase<MapType, Near> self_type;
+
+    typedef MapType map_type;
+    typedef typename MapType::execution_space execution_space;
+
+    map_type m_map;
+    uint32_t m_num_erase;
+    uint32_t m_num_duplicates;
+
+    TestErase(map_type map, uint32_t num_erases, uint32_t num_duplicates)
+      : m_map(map)
+      , m_num_erase(num_erases)
+      , m_num_duplicates(num_duplicates)
+    {}
+
+    void testit()
+    {
+      execution_space::fence();
+      Kokkos::parallel_for(m_num_erase, *this);
+      execution_space::fence();
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(typename execution_space::size_type i) const
+    {
+      if (Near) {
+        m_map.erase(i/m_num_duplicates);
+      }
+      else {
+        m_map.erase(i%(m_num_erase/m_num_duplicates));
+      }
+
+    }
+  };
+
+  template <typename MapType>
+  struct TestFind
+  {
+    typedef MapType map_type;
+    typedef typename MapType::execution_space::execution_space execution_space;
+    typedef uint32_t value_type;
+
+    map_type m_map;
+    uint32_t m_num_insert;
+    uint32_t m_num_duplicates;
+    uint32_t m_max_key;
+
+    TestFind(map_type map, uint32_t num_inserts, uint32_t num_duplicates)
+      : m_map(map)
+      , m_num_insert(num_inserts)
+      , m_num_duplicates(num_duplicates)
+      , m_max_key( ((num_inserts + num_duplicates) - 1)/num_duplicates )
+    {}
+
+    void testit(value_type &errors)
+    {
+      execution_space::execution_space::fence();
+      Kokkos::parallel_reduce(m_map.capacity(), *this, errors);
+      execution_space::execution_space::fence();
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    static void init( value_type & dst)
+    {
+      dst = 0;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    static void join( volatile value_type & dst, const volatile value_type & src)
+    { dst += src; }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(typename execution_space::size_type i, value_type & errors) const
+    {
+      const bool expect_to_find_i = (i < m_max_key);
+
+      const bool exists = m_map.exists(i);
+
+      if (expect_to_find_i && !exists)  ++errors;
+      if (!expect_to_find_i && exists)  ++errors;
+    }
+  };
+
+} // namespace Impl
+
+
+
+template <typename Device>
+void test_insert( uint32_t num_nodes , uint32_t num_inserts , uint32_t num_duplicates , bool near )
+{
+  typedef Kokkos::UnorderedMap<uint32_t,uint32_t, Device> map_type;
+  typedef Kokkos::UnorderedMap<const uint32_t,const uint32_t, Device> const_map_type;
+
+  const uint32_t expected_inserts = (num_inserts + num_duplicates -1u) / num_duplicates;
+
+  map_type map;
+  map.rehash(num_nodes,false);
+
+  if (near) {
+    Impl::TestInsert<map_type,true> test_insert(map, num_inserts, num_duplicates);
+    test_insert.testit();
+  } else
+  {
+    Impl::TestInsert<map_type,false> test_insert(map, num_inserts, num_duplicates);
+    test_insert.testit();
+  }
+
+  const bool print_list = false;
+  if (print_list) {
+    Kokkos::Impl::UnorderedMapPrint<map_type> f(map);
+    f.apply();
+  }
+
+  const uint32_t map_size = map.size();
+
+  ASSERT_FALSE( map.failed_insert());
+  {
+    EXPECT_EQ(expected_inserts, map_size);
+
+    {
+      uint32_t find_errors = 0;
+      Impl::TestFind<const_map_type> test_find(map, num_inserts, num_duplicates);
+      test_find.testit(find_errors);
+      EXPECT_EQ( 0u, find_errors);
+    }
+
+    map.begin_erase();
+    Impl::TestErase<map_type,false> test_erase(map, num_inserts, num_duplicates);
+    test_erase.testit();
+    map.end_erase();
+    EXPECT_EQ(0u, map.size());
+  }
+}
+
+template <typename Device>
+void test_failed_insert( uint32_t num_nodes)
+{
+  typedef Kokkos::UnorderedMap<uint32_t,uint32_t, Device> map_type;
+
+  map_type map(num_nodes);
+  Impl::TestInsert<map_type> test_insert(map, 2u*num_nodes, 1u);
+  test_insert.testit(false /*don't rehash on fail*/);
+  Device::execution_space::fence();
+
+  EXPECT_TRUE( map.failed_insert() );
+}
+
+
+
+template <typename Device>
+void test_deep_copy( uint32_t num_nodes )
+{
+  typedef Kokkos::UnorderedMap<uint32_t,uint32_t, Device> map_type;
+  typedef Kokkos::UnorderedMap<const uint32_t, const uint32_t, Device> const_map_type;
+
+  typedef typename map_type::HostMirror host_map_type ;
+  // typedef Kokkos::UnorderedMap<uint32_t, uint32_t, typename Device::host_mirror_execution_space > host_map_type;
+
+  map_type map;
+  map.rehash(num_nodes,false);
+
+  {
+    Impl::TestInsert<map_type> test_insert(map, num_nodes, 1);
+    test_insert.testit();
+    ASSERT_EQ( map.size(), num_nodes);
+    ASSERT_FALSE( map.failed_insert() );
+    {
+      uint32_t find_errors = 0;
+      Impl::TestFind<map_type> test_find(map, num_nodes, 1);
+      test_find.testit(find_errors);
+      EXPECT_EQ( find_errors, 0u);
+    }
+
+  }
+
+  host_map_type hmap;
+  Kokkos::deep_copy(hmap, map);
+
+  ASSERT_EQ( map.size(), hmap.size());
+  ASSERT_EQ( map.capacity(), hmap.capacity());
+  {
+    uint32_t find_errors = 0;
+    Impl::TestFind<host_map_type> test_find(hmap, num_nodes, 1);
+    test_find.testit(find_errors);
+    EXPECT_EQ( find_errors, 0u);
+  }
+
+  map_type mmap;
+  Kokkos::deep_copy(mmap, hmap);
+
+  const_map_type cmap = mmap;
+
+  EXPECT_EQ( cmap.size(), num_nodes);
+
+  {
+    uint32_t find_errors = 0;
+    Impl::TestFind<const_map_type> test_find(cmap, num_nodes, 1);
+    test_find.testit(find_errors);
+    EXPECT_EQ( find_errors, 0u);
+  }
+
+}
+
+} // namespace Test
+
+#endif //KOKKOS_TEST_UNORDERED_MAP_HPP
diff --git a/lib/kokkos/containers/unit_tests/TestVector.hpp b/lib/kokkos/containers/unit_tests/TestVector.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..f9f4564898edf32e0030d0ca135ff9f43909f397
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/TestVector.hpp
@@ -0,0 +1,131 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef KOKKOS_TEST_VECTOR_HPP
+#define KOKKOS_TEST_VECTOR_HPP
+
+#include <gtest/gtest.h>
+#include <iostream>
+#include <cstdlib>
+#include <cstdio>
+#include <impl/Kokkos_Timer.hpp>
+
+namespace Test {
+
+namespace Impl {
+
+  template <typename Scalar, class Device>
+  struct test_vector_combinations
+  {
+    typedef test_vector_combinations<Scalar,Device> self_type;
+
+    typedef Scalar scalar_type;
+    typedef Device execution_space;
+
+    Scalar reference;
+    Scalar result;
+
+    template <typename Vector>
+    Scalar run_me(unsigned int n){
+      Vector a(n,1);
+
+
+      a.push_back(2);
+      a.resize(n+4);
+      a[n+1] = 3;
+      a[n+2] = 4;
+      a[n+3] = 5;
+
+
+      Scalar temp1 = a[2];
+      Scalar temp2 = a[n];
+      Scalar temp3 = a[n+1];
+
+      a.assign(n+2,-1);
+
+      a[2] = temp1;
+      a[n] = temp2;
+      a[n+1] = temp3;
+
+      Scalar test1 = 0;
+      for(unsigned int i=0; i<a.size(); i++)
+        test1+=a[i];
+
+      a.assign(n+1,-2);
+      Scalar test2 = 0;
+      for(unsigned int i=0; i<a.size(); i++)
+        test2+=a[i];
+
+      a.reserve(n+10);
+
+      Scalar test3 = 0;
+      for(unsigned int i=0; i<a.size(); i++)
+        test3+=a[i];
+
+
+      return (test1*test2+test3)*test2+test1*test3;
+    }
+
+
+    test_vector_combinations(unsigned int size)
+    {
+      reference = run_me<std::vector<Scalar> >(size);
+      result = run_me<Kokkos::vector<Scalar,Device> >(size);
+    }
+
+   };
+
+} // namespace Impl
+
+
+
+
+template <typename Scalar, typename Device>
+void test_vector_combinations(unsigned int size)
+{
+  Impl::test_vector_combinations<Scalar,Device> test(size);
+  ASSERT_EQ( test.reference, test.result);
+}
+
+
+} // namespace Test
+
+#endif //KOKKOS_TEST_UNORDERED_MAP_HPP
diff --git a/lib/kokkos/containers/unit_tests/UnitTestMain.cpp b/lib/kokkos/containers/unit_tests/UnitTestMain.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..f952ab3db51028aff0a0ebfe313b2639e353ab87
--- /dev/null
+++ b/lib/kokkos/containers/unit_tests/UnitTestMain.cpp
@@ -0,0 +1,50 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+int main(int argc, char *argv[]) {
+  ::testing::InitGoogleTest(&argc,argv);
+  return RUN_ALL_TESTS();
+}
+
diff --git a/lib/kokkos/core/perf_test/Makefile b/lib/kokkos/core/perf_test/Makefile
new file mode 100755
index 0000000000000000000000000000000000000000..2bf189a22f7227084cd02fc28c0ddf591d7e8fe8
--- /dev/null
+++ b/lib/kokkos/core/perf_test/Makefile
@@ -0,0 +1,66 @@
+KOKKOS_PATH = ../..
+
+GTEST_PATH = ../../TPL/gtest
+
+vpath %.cpp ${KOKKOS_PATH}/core/perf_test
+
+default: build_all
+	echo "End Build"
+	
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	CXX = nvcc_wrapper
+	CXXFLAGS ?= -O3
+	LINK = $(CXX)
+	LDFLAGS ?= -lpthread
+else
+	CXX ?= g++
+	CXXFLAGS ?= -O3
+	LINK ?= $(CXX)
+	LDFLAGS ?=  -lpthread
+endif
+
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/perf_test
+
+TEST_TARGETS = 
+TARGETS = 
+
+OBJ_PERF = PerfTestHost.o PerfTestCuda.o PerfTestMain.o gtest-all.o
+TARGETS += KokkosCore_PerformanceTest
+TEST_TARGETS += test-performance
+
+OBJ_ATOMICS = test_atomic.o 
+TARGETS += KokkosCore_PerformanceTest_Atomics
+TEST_TARGETS += test-atomic
+
+
+KokkosCore_PerformanceTest: $(OBJ_PERF) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_PERF) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest
+
+KokkosCore_PerformanceTest_Atomics: $(OBJ_ATOMICS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_ATOMICS) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_PerformanceTest_Atomics
+
+test-performance: KokkosCore_PerformanceTest
+	./KokkosCore_PerformanceTest
+
+test-atomic: KokkosCore_PerformanceTest_Atomics
+	./KokkosCore_PerformanceTest_Atomics
+	
+
+build_all: $(TARGETS)
+
+test: $(TEST_TARGETS)
+	
+clean: kokkos-clean 
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
+gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc 
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
+
diff --git a/lib/kokkos/core/perf_test/PerfTestBlasKernels.hpp b/lib/kokkos/core/perf_test/PerfTestBlasKernels.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..aa4046cbf047defd47a89141d960ad330622d9b7
--- /dev/null
+++ b/lib/kokkos/core/perf_test/PerfTestBlasKernels.hpp
@@ -0,0 +1,309 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_BLAS_KERNELS_HPP
+#define KOKKOS_BLAS_KERNELS_HPP
+
+namespace Kokkos {
+
+template< class ConstVectorType ,
+          class Device = typename ConstVectorType::execution_space >
+struct Dot ;
+
+template< class ConstVectorType ,
+          class Device = typename ConstVectorType::execution_space >
+struct DotSingle ;
+
+template< class ConstScalarType ,
+          class VectorType ,
+          class Device = typename VectorType::execution_space >
+struct Scale ;
+
+template< class ConstScalarType ,
+          class ConstVectorType ,
+          class VectorType ,
+          class Device = typename VectorType::execution_space >
+struct AXPBY ;
+
+/** \brief  Y = alpha * X + beta * Y */
+template< class ConstScalarType ,
+          class ConstVectorType ,
+          class      VectorType >
+void axpby( const ConstScalarType & alpha ,
+            const ConstVectorType & X ,
+            const ConstScalarType & beta ,
+            const      VectorType & Y )
+{
+  typedef AXPBY< ConstScalarType , ConstVectorType , VectorType > functor ;
+
+  parallel_for( Y.dimension_0() , functor( alpha , X , beta , Y ) );
+}
+
+/** \brief  Y *= alpha */
+template< class ConstScalarType ,
+          class      VectorType >
+void scale( const ConstScalarType & alpha , const VectorType & Y )
+{
+  typedef Scale< ConstScalarType , VectorType > functor ;
+
+  parallel_for( Y.dimension_0() , functor( alpha , Y ) );
+}
+
+template< class ConstVectorType ,
+          class Finalize >
+void dot( const ConstVectorType & X ,
+          const ConstVectorType & Y ,
+          const Finalize & finalize )
+{
+  typedef Dot< ConstVectorType >  functor ;
+
+  parallel_reduce( X.dimension_0() , functor( X , Y ) , finalize );
+}
+
+template< class ConstVectorType ,
+          class Finalize >
+void dot( const ConstVectorType & X ,
+          const Finalize & finalize )
+{
+  typedef DotSingle< ConstVectorType >  functor ;
+
+  parallel_reduce( X.dimension_0() , functor( X ) , finalize );
+}
+
+} /* namespace Kokkos */
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< class Type , class Device >
+struct Dot
+{
+  typedef typename Device::execution_space execution_space ;
+
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
+                            Impl::unsigned_< Type::Rank > >::type ok_rank ;
+
+
+/*  typedef typename
+    Impl::StaticAssertSame< execution_space ,
+                            typename Type::execution_space >::type ok_device ;*/
+
+  typedef double value_type ;
+
+#if 1
+  typename Type::const_type X ;
+  typename Type::const_type Y ;
+#else
+  Type X ;
+  Type Y ;
+#endif
+
+  Dot( const Type & arg_x , const Type & arg_y )
+    : X(arg_x) , Y(arg_y) { }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i , value_type & update ) const
+    { update += X[i] * Y[i]; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & source )
+    { update += source; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+};
+
+template< class Type , class Device >
+struct DotSingle
+{
+  typedef typename Device::execution_space execution_space ;
+
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
+                            Impl::unsigned_< Type::Rank > >::type ok_rank ;
+
+/*  typedef typename
+    Impl::StaticAssertSame< execution_space ,
+                            typename Type::execution_space >::type ok_device ;*/
+
+  typedef double value_type ;
+
+#if 1
+  typename Type::const_type X ;
+#else
+  Type X ;
+#endif
+
+  DotSingle( const Type & arg_x ) : X(arg_x) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i , value_type & update ) const
+    {
+      const typename Type::value_type & x = X[i]; update += x * x ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & source )
+    { update += source; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+};
+
+
+template< class ScalarType , class VectorType , class Device>
+struct Scale
+{
+  typedef typename Device::execution_space execution_space ;
+
+/*  typedef typename
+    Impl::StaticAssertSame< execution_space ,
+                            typename ScalarType::execution_space >::type
+      ok_scalar_device ;
+
+  typedef typename
+    Impl::StaticAssertSame< execution_space ,
+                            typename VectorType::execution_space >::type
+      ok_vector_device ;*/
+
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 0 > ,
+                            Impl::unsigned_< ScalarType::Rank > >::type
+      ok_scalar_rank ;
+
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
+                            Impl::unsigned_< VectorType::Rank > >::type
+      ok_vector_rank ;
+
+#if 1
+  typename ScalarType::const_type alpha ;
+#else
+  ScalarType alpha ;
+#endif
+
+  VectorType Y ;
+
+  Scale( const ScalarType & arg_alpha , const VectorType & arg_Y )
+    : alpha( arg_alpha ), Y( arg_Y ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i ) const
+    {
+      Y[i] *= alpha() ;
+    }
+};
+
+
+template< class ScalarType ,
+          class ConstVectorType ,
+          class VectorType,
+          class Device>
+struct AXPBY
+{
+  typedef typename Device::execution_space execution_space ;
+
+/*  typedef typename
+    Impl::StaticAssertSame< execution_space ,
+                            typename ScalarType::execution_space >::type
+      ok_scalar_device ;
+
+  typedef typename
+    Impl::StaticAssertSame< execution_space ,
+                            typename ConstVectorType::execution_space >::type
+      ok_const_vector_device ;
+
+  typedef typename
+    Impl::StaticAssertSame< execution_space ,
+                            typename VectorType::execution_space >::type
+      ok_vector_device ;*/
+
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 0 > ,
+                            Impl::unsigned_< ScalarType::Rank > >::type
+      ok_scalar_rank ;
+
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
+                            Impl::unsigned_< ConstVectorType::Rank > >::type
+      ok_const_vector_rank ;
+
+  typedef typename
+    Impl::StaticAssertSame< Impl::unsigned_< 1 > ,
+                            Impl::unsigned_< VectorType::Rank > >::type
+      ok_vector_rank ;
+
+#if 1
+  typename ScalarType::const_type alpha , beta ;
+  typename ConstVectorType::const_type X ;
+#else
+  ScalarType alpha , beta ;
+  ConstVectorType X ;
+#endif
+
+  VectorType Y ;
+
+  AXPBY( const ScalarType      & arg_alpha ,
+         const ConstVectorType & arg_X ,
+         const ScalarType      & arg_beta ,
+         const VectorType      & arg_Y )
+    : alpha( arg_alpha ), beta( arg_beta ), X( arg_X ), Y( arg_Y ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( int i ) const
+    {
+      Y[i] = alpha() * X[i] + beta() * Y[i] ;
+    }
+};
+
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_BLAS_KERNELS_HPP */
diff --git a/lib/kokkos/core/perf_test/PerfTestCuda.cpp b/lib/kokkos/core/perf_test/PerfTestCuda.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..28e654bb700cb4f6fa1b75636ab38f5c8fdf7326
--- /dev/null
+++ b/lib/kokkos/core/perf_test/PerfTestCuda.cpp
@@ -0,0 +1,189 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <iostream>
+#include <iomanip>
+#include <algorithm>
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_HAVE_CUDA )
+
+#include <impl/Kokkos_Timer.hpp>
+
+#include <PerfTestHexGrad.hpp>
+#include <PerfTestBlasKernels.hpp>
+#include <PerfTestGramSchmidt.hpp>
+#include <PerfTestDriver.hpp>
+
+
+namespace Test {
+
+class cuda : public ::testing::Test {
+  protected:
+    static void SetUpTestCase() {
+      Kokkos::HostSpace::execution_space::initialize();
+      Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
+    }
+    static void TearDownTestCase() {
+      Kokkos::Cuda::finalize();
+      Kokkos::HostSpace::execution_space::finalize();
+    }
+};
+
+TEST_F( cuda, hexgrad )
+{
+  EXPECT_NO_THROW( run_test_hexgrad< Kokkos::Cuda >( 10 , 20, "Kokkos::Cuda" ) );
+}
+
+TEST_F( cuda, gramschmidt )
+{
+  EXPECT_NO_THROW( run_test_gramschmidt< Kokkos::Cuda >( 10 , 20, "Kokkos::Cuda" ) );
+}
+
+namespace {
+
+template <typename T>
+struct TextureFetch
+{
+  typedef Kokkos::View< T *, Kokkos::CudaSpace> array_type;
+  typedef Kokkos::View< const T *, Kokkos::CudaSpace, Kokkos::MemoryRandomAccess> const_array_type;
+  typedef Kokkos::View< int *, Kokkos::CudaSpace> index_array_type;
+  typedef Kokkos::View< const int *, Kokkos::CudaSpace> const_index_array_type;
+
+  struct FillArray
+  {
+    array_type m_array;
+    FillArray( const array_type & array )
+      : m_array(array)
+    {}
+
+    void apply() const
+    {
+      Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::Cuda,int>(0,m_array.size()), *this);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(int i) const { m_array(i) = i; }
+  };
+
+  struct RandomIndexes
+  {
+    index_array_type m_indexes;
+    typename index_array_type::HostMirror m_host_indexes;
+    RandomIndexes( const index_array_type & indexes)
+      : m_indexes(indexes)
+      , m_host_indexes(Kokkos::create_mirror(m_indexes))
+    {}
+
+    void apply() const
+    {
+      Kokkos::parallel_for( Kokkos::RangePolicy<Kokkos::HostSpace::execution_space,int>(0,m_host_indexes.size()), *this);
+      //random shuffle
+      Kokkos::HostSpace::execution_space::fence();
+      std::random_shuffle(m_host_indexes.ptr_on_device(), m_host_indexes.ptr_on_device() + m_host_indexes.size());
+      Kokkos::deep_copy(m_indexes,m_host_indexes);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(int i) const { m_host_indexes(i) = i; }
+  };
+
+  struct RandomReduce
+  {
+    const_array_type       m_array;
+    const_index_array_type m_indexes;
+    RandomReduce( const const_array_type & array, const const_index_array_type & indexes)
+      : m_array(array)
+      , m_indexes(indexes)
+    {}
+
+    void apply(T & reduce) const
+    {
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<Kokkos::Cuda,int>(0,m_array.size()), *this, reduce);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()(int i, T & reduce) const
+    { reduce += m_array(m_indexes(i)); }
+  };
+
+  static void run(int size, double & reduce_time, T &reduce)
+  {
+    array_type array("array",size);
+    index_array_type indexes("indexes",size);
+
+    { FillArray f(array); f.apply(); }
+    { RandomIndexes f(indexes); f.apply(); }
+
+    Kokkos::Cuda::fence();
+
+    Kokkos::Impl::Timer timer;
+    for (int j=0; j<10; ++j) {
+      RandomReduce f(array,indexes);
+      f.apply(reduce);
+    }
+    Kokkos::Cuda::fence();
+    reduce_time = timer.seconds();
+  }
+};
+
+} // unnamed namespace
+
+TEST_F( cuda, texture_double )
+{
+  printf("Random reduce of double through texture fetch\n");
+  for (int i=1; i<=27; ++i) {
+    int size = 1<<i;
+    double time = 0;
+    double reduce = 0;
+    TextureFetch<double>::run(size,time,reduce);
+    printf("   time = %1.3e   size = 2^%d\n", time, i);
+  }
+}
+
+} // namespace Test
+
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
+
diff --git a/lib/kokkos/core/perf_test/PerfTestDriver.hpp b/lib/kokkos/core/perf_test/PerfTestDriver.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..e3dd3b4123a2dae6fd4f69f77a046796f9c040c8
--- /dev/null
+++ b/lib/kokkos/core/perf_test/PerfTestDriver.hpp
@@ -0,0 +1,152 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <iostream>
+#include <string>
+
+// mfh 06 Jun 2013: This macro doesn't work like one might thing it
+// should.  It doesn't take the template parameter DeviceType and
+// print its actual type name; it just literally prints out
+// "DeviceType".  I've worked around this below without using the
+// macro, so I'm commenting out the macro to avoid compiler complaints
+// about an unused macro.
+
+// #define KOKKOS_MACRO_IMPL_TO_STRING( X ) #X
+// #define KOKKOS_MACRO_TO_STRING( X )  KOKKOS_MACRO_IMPL_TO_STRING( X )
+
+//------------------------------------------------------------------------
+
+namespace Test {
+
+enum { NUMBER_OF_TRIALS = 5 };
+
+
+
+template< class DeviceType >
+void run_test_hexgrad( int exp_beg , int exp_end, const char deviceTypeName[] )
+{
+  std::string label_hexgrad ;
+  label_hexgrad.append( "\"HexGrad< double , " );
+  // mfh 06 Jun 2013: This only appends "DeviceType" (literally) to
+  // the string, not the actual name of the device type.  Thus, I've
+  // modified the function to take the name of the device type.
+  //
+  //label_hexgrad.append( KOKKOS_MACRO_TO_STRING( DeviceType ) );
+  label_hexgrad.append( deviceTypeName );
+  label_hexgrad.append( " >\"" );
+
+  for (int i = exp_beg ; i < exp_end ; ++i) {
+    double min_seconds = 0.0 ;
+    double max_seconds = 0.0 ;
+    double avg_seconds = 0.0 ;
+
+    const int parallel_work_length = 1<<i;
+
+    for ( int j = 0 ; j < NUMBER_OF_TRIALS ; ++j ) {
+      const double seconds = HexGrad< DeviceType >::test(parallel_work_length) ;
+
+      if ( 0 == j ) {
+        min_seconds = seconds ;
+        max_seconds = seconds ;
+      }
+      else {
+        if ( seconds < min_seconds ) min_seconds = seconds ;
+        if ( seconds > max_seconds ) max_seconds = seconds ;
+      }
+      avg_seconds += seconds ;
+    }
+    avg_seconds /= NUMBER_OF_TRIALS ;
+
+    std::cout << label_hexgrad
+      << " , " << parallel_work_length
+      << " , " << min_seconds
+      << " , " << ( min_seconds / parallel_work_length )
+      << std::endl ;
+  }
+}
+
+template< class DeviceType >
+void run_test_gramschmidt( int exp_beg , int exp_end, const char deviceTypeName[] )
+{
+  std::string label_gramschmidt ;
+  label_gramschmidt.append( "\"GramSchmidt< double , " );
+  // mfh 06 Jun 2013: This only appends "DeviceType" (literally) to
+  // the string, not the actual name of the device type.  Thus, I've
+  // modified the function to take the name of the device type.
+  //
+  //label_gramschmidt.append( KOKKOS_MACRO_TO_STRING( DeviceType ) );
+  label_gramschmidt.append( deviceTypeName );
+  label_gramschmidt.append( " >\"" );
+
+  for (int i = exp_beg ; i < exp_end ; ++i) {
+    double min_seconds = 0.0 ;
+    double max_seconds = 0.0 ;
+    double avg_seconds = 0.0 ;
+
+    const int parallel_work_length = 1<<i;
+
+    for ( int j = 0 ; j < NUMBER_OF_TRIALS ; ++j ) {
+      const double seconds = ModifiedGramSchmidt< double , DeviceType >::test(parallel_work_length, 32 ) ;
+
+      if ( 0 == j ) {
+        min_seconds = seconds ;
+        max_seconds = seconds ;
+      }
+      else {
+        if ( seconds < min_seconds ) min_seconds = seconds ;
+        if ( seconds > max_seconds ) max_seconds = seconds ;
+      }
+      avg_seconds += seconds ;
+    }
+    avg_seconds /= NUMBER_OF_TRIALS ;
+
+    std::cout << label_gramschmidt
+      << " , " << parallel_work_length
+      << " , " << min_seconds
+      << " , " << ( min_seconds / parallel_work_length )
+      << std::endl ;
+  }
+}
+
+}
+
diff --git a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.hpp b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..292e09cc4a69783278d536a713e2d9df19b4d6c1
--- /dev/null
+++ b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.hpp
@@ -0,0 +1,231 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cmath>
+#include <PerfTestBlasKernels.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Test {
+
+// Reduction   : result = dot( Q(:,j) , Q(:,j) );
+// PostProcess : R(j,j) = result ; inv = 1 / result ;
+template< class VectorView , class ValueView  >
+struct InvNorm2 : public Kokkos::DotSingle< VectorView > {
+
+  typedef typename Kokkos::DotSingle< VectorView >::value_type value_type ;
+
+  ValueView  Rjj ;
+  ValueView  inv ;
+
+  InvNorm2( const VectorView & argX ,
+            const ValueView  & argR ,
+            const ValueView  & argInv )
+    : Kokkos::DotSingle< VectorView >( argX )
+    , Rjj( argR )
+    , inv( argInv )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void final( value_type & result ) const
+  {
+    result = sqrt( result );
+    Rjj() = result ;
+    inv() = ( 0 < result ) ? 1.0 / result : 0 ;
+  }
+};
+
+template< class VectorView , class ValueView >
+inline
+void invnorm2( const VectorView & x ,
+               const ValueView  & r ,
+               const ValueView  & r_inv )
+{
+  Kokkos::parallel_reduce( x.dimension_0() , InvNorm2< VectorView , ValueView >( x , r , r_inv ) );
+}
+
+// PostProcess : tmp = - ( R(j,k) = result );
+template< class VectorView , class ValueView  >
+struct DotM : public Kokkos::Dot< VectorView > {
+
+  typedef typename Kokkos::Dot< VectorView >::value_type value_type ;
+
+  ValueView  Rjk ;
+  ValueView  tmp ;
+
+  DotM( const VectorView & argX ,
+        const VectorView & argY ,
+        const ValueView & argR ,
+        const ValueView & argTmp )
+    : Kokkos::Dot< VectorView >( argX , argY )
+    , Rjk( argR )
+    , tmp( argTmp )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void final( value_type & result ) const
+  {
+     Rjk()  = result ;
+     tmp()  = - result ;
+  }
+};
+
+template< class VectorView , class ValueView >
+inline
+void dot_neg( const VectorView & x ,
+              const VectorView & y ,
+              const ValueView  & r ,
+              const ValueView  & r_neg )
+{
+  Kokkos::parallel_reduce( x.dimension_0() , DotM< VectorView , ValueView >( x , y , r , r_neg ) );
+}
+
+
+template< typename Scalar , class DeviceType >
+struct ModifiedGramSchmidt
+{
+  typedef DeviceType  execution_space ;
+  typedef typename execution_space::size_type  size_type ;
+
+  typedef Kokkos::View< Scalar** ,
+                        Kokkos::LayoutLeft ,
+                        execution_space > multivector_type ;
+
+  typedef Kokkos::View< Scalar* ,
+                        Kokkos::LayoutLeft ,
+                        execution_space > vector_type ;
+
+  typedef Kokkos::View< Scalar ,
+                        Kokkos::LayoutLeft ,
+                        execution_space > value_view ;
+
+
+  multivector_type Q ;
+  multivector_type R ;
+
+  static double factorization( const multivector_type Q_ ,
+                               const multivector_type R_ )
+  {
+#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
+    using Kokkos::Experimental::ALL ;
+#else
+    const Kokkos::ALL ALL ;
+#endif
+    const size_type count  = Q_.dimension_1();
+    value_view tmp("tmp");
+    value_view one("one");
+
+    Kokkos::deep_copy( one , (Scalar) 1 );
+
+    Kokkos::Impl::Timer timer ;
+
+    for ( size_type j = 0 ; j < count ; ++j ) {
+      // Reduction   : tmp = dot( Q(:,j) , Q(:,j) );
+      // PostProcess : tmp = sqrt( tmp ); R(j,j) = tmp ; tmp = 1 / tmp ;
+      const vector_type Qj  = Kokkos::subview( Q_ , ALL , j );
+      const value_view  Rjj = Kokkos::subview( R_ , j , j );
+
+      invnorm2( Qj , Rjj , tmp );
+
+      // Q(:,j) *= ( 1 / R(j,j) ); => Q(:,j) *= tmp ;
+      Kokkos::scale( tmp , Qj );
+
+      for ( size_t k = j + 1 ; k < count ; ++k ) {
+        const vector_type Qk = Kokkos::subview( Q_ , ALL , k );
+        const value_view  Rjk = Kokkos::subview( R_ , j , k );
+
+        // Reduction   : R(j,k) = dot( Q(:,j) , Q(:,k) );
+        // PostProcess : tmp = - R(j,k);
+        dot_neg( Qj , Qk , Rjk , tmp );
+
+        // Q(:,k) -= R(j,k) * Q(:,j); => Q(:,k) += tmp * Q(:,j)
+        Kokkos::axpby( tmp , Qj , one , Qk );
+      }
+    }
+
+    execution_space::fence();
+
+    return timer.seconds();
+  }
+
+  //--------------------------------------------------------------------------
+
+  static double test( const size_t length ,
+                      const size_t count ,
+                      const size_t iter = 1 )
+  {
+    multivector_type Q_( "Q" , length , count );
+    multivector_type R_( "R" , count , count );
+
+    typename multivector_type::HostMirror A =
+      Kokkos::create_mirror( Q_ );
+
+    // Create and fill A on the host
+
+    for ( size_type j = 0 ; j < count ; ++j ) {
+      for ( size_type i = 0 ; i < length ; ++i ) {
+        A(i,j) = ( i + 1 ) * ( j + 1 );
+      }
+    }
+
+    double dt_min = 0 ;
+
+    for ( size_t i = 0 ; i < iter ; ++i ) {
+
+      Kokkos::deep_copy( Q_ , A );
+
+      // A = Q * R
+
+      const double dt = factorization( Q_ , R_ );
+
+      if ( 0 == i ) dt_min = dt ;
+      else dt_min = dt < dt_min ? dt : dt_min ;
+    }
+
+    return dt_min ;
+  }
+};
+
+}
+
diff --git a/lib/kokkos/core/perf_test/PerfTestHexGrad.hpp b/lib/kokkos/core/perf_test/PerfTestHexGrad.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..d13d9a49e800b8064852c174755c4eea3a94be4b
--- /dev/null
+++ b/lib/kokkos/core/perf_test/PerfTestHexGrad.hpp
@@ -0,0 +1,268 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+namespace Test {
+
+template< class DeviceType ,
+          typename CoordScalarType = double ,
+          typename GradScalarType  = float >
+struct HexGrad
+{
+  typedef DeviceType execution_space ;
+  typedef typename execution_space::size_type  size_type ;
+
+  typedef HexGrad<DeviceType,CoordScalarType,GradScalarType> self_type;
+
+  // 3D array : ( ParallelWork , Space , Node )
+
+  enum { NSpace = 3 , NNode = 8 };
+
+  typedef Kokkos::View< CoordScalarType*[NSpace][NNode] , execution_space >
+    elem_coord_type ;
+
+  typedef Kokkos::View< GradScalarType*[NSpace][NNode] , execution_space >
+    elem_grad_type ;
+
+  elem_coord_type  coords ;
+  elem_grad_type   grad_op ;
+
+  enum { FLOPS  = 318 }; // = 3 * ( 18 + 8 * 11 ) };
+  enum { READS  = 18 };
+  enum { WRITES = 18 };
+
+  HexGrad( const elem_coord_type  & arg_coords ,
+           const elem_grad_type   & arg_grad_op )
+    : coords( arg_coords )
+    , grad_op( arg_grad_op )
+    {}
+
+  KOKKOS_INLINE_FUNCTION static
+  void grad( const CoordScalarType x[] ,
+             const CoordScalarType z[] ,
+                   GradScalarType grad_y[] )
+  {
+    const GradScalarType R42=(x[3] - x[1]);
+    const GradScalarType R52=(x[4] - x[1]);
+    const GradScalarType R54=(x[4] - x[3]);
+
+    const GradScalarType R63=(x[5] - x[2]);
+    const GradScalarType R83=(x[7] - x[2]);
+    const GradScalarType R86=(x[7] - x[5]);
+
+    const GradScalarType R31=(x[2] - x[0]);
+    const GradScalarType R61=(x[5] - x[0]);
+    const GradScalarType R74=(x[6] - x[3]);
+
+    const GradScalarType R72=(x[6] - x[1]);
+    const GradScalarType R75=(x[6] - x[4]);
+    const GradScalarType R81=(x[7] - x[0]);
+
+    const GradScalarType t1=(R63 + R54);
+    const GradScalarType t2=(R61 + R74);
+    const GradScalarType t3=(R72 + R81);
+
+    const GradScalarType t4 =(R86 + R42);
+    const GradScalarType t5 =(R83 + R52);
+    const GradScalarType t6 =(R75 + R31);
+
+    //  Calculate Y gradient from X and Z data
+
+    grad_y[0] = (z[1] *  t1) - (z[2] * R42) - (z[3] *  t5)  + (z[4] *  t4) + (z[5] * R52) - (z[7] * R54);
+    grad_y[1] = (z[2] *  t2) + (z[3] * R31) - (z[0] *  t1)  - (z[5] *  t6) + (z[6] * R63) - (z[4] * R61);
+    grad_y[2] = (z[3] *  t3) + (z[0] * R42) - (z[1] *  t2)  - (z[6] *  t4) + (z[7] * R74) - (z[5] * R72);
+    grad_y[3] = (z[0] *  t5) - (z[1] * R31) - (z[2] *  t3)  + (z[7] *  t6) + (z[4] * R81) - (z[6] * R83);
+    grad_y[4] = (z[5] *  t3) + (z[6] * R86) - (z[7] *  t2)  - (z[0] *  t4) - (z[3] * R81) + (z[1] * R61);
+    grad_y[5] = (z[6] *  t5) - (z[4] *  t3)  - (z[7] * R75) + (z[1] *  t6) - (z[0] * R52) + (z[2] * R72);
+    grad_y[6] = (z[7] *  t1) - (z[5] *  t5)  - (z[4] * R86) + (z[2] *  t4) - (z[1] * R63) + (z[3] * R83);
+    grad_y[7] = (z[4] *  t2) - (z[6] *  t1)  + (z[5] * R75) - (z[3] *  t6) - (z[2] * R74) + (z[0] * R54);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type ielem ) const
+  {
+    GradScalarType g[NNode] ;
+
+    const CoordScalarType x[NNode] = {
+      coords(ielem,0,0),
+      coords(ielem,0,1),
+      coords(ielem,0,2),
+      coords(ielem,0,3),
+      coords(ielem,0,4),
+      coords(ielem,0,5),
+      coords(ielem,0,6),
+      coords(ielem,0,7)
+    };
+
+    const CoordScalarType y[NNode] = {
+      coords(ielem,1,0),
+      coords(ielem,1,1),
+      coords(ielem,1,2),
+      coords(ielem,1,3),
+      coords(ielem,1,4),
+      coords(ielem,1,5),
+      coords(ielem,1,6),
+      coords(ielem,1,7)
+    };
+
+    const CoordScalarType z[NNode] = {
+      coords(ielem,2,0),
+      coords(ielem,2,1),
+      coords(ielem,2,2),
+      coords(ielem,2,3),
+      coords(ielem,2,4),
+      coords(ielem,2,5),
+      coords(ielem,2,6),
+      coords(ielem,2,7)
+    };
+
+    grad( z , y , g );
+
+    grad_op(ielem,0,0) = g[0];
+    grad_op(ielem,0,1) = g[1];
+    grad_op(ielem,0,2) = g[2];
+    grad_op(ielem,0,3) = g[3];
+    grad_op(ielem,0,4) = g[4];
+    grad_op(ielem,0,5) = g[5];
+    grad_op(ielem,0,6) = g[6];
+    grad_op(ielem,0,7) = g[7];
+
+    grad( x , z , g );
+
+    grad_op(ielem,1,0) = g[0];
+    grad_op(ielem,1,1) = g[1];
+    grad_op(ielem,1,2) = g[2];
+    grad_op(ielem,1,3) = g[3];
+    grad_op(ielem,1,4) = g[4];
+    grad_op(ielem,1,5) = g[5];
+    grad_op(ielem,1,6) = g[6];
+    grad_op(ielem,1,7) = g[7];
+
+    grad( y , x , g );
+
+    grad_op(ielem,2,0) = g[0];
+    grad_op(ielem,2,1) = g[1];
+    grad_op(ielem,2,2) = g[2];
+    grad_op(ielem,2,3) = g[3];
+    grad_op(ielem,2,4) = g[4];
+    grad_op(ielem,2,5) = g[5];
+    grad_op(ielem,2,6) = g[6];
+    grad_op(ielem,2,7) = g[7];
+  }
+
+  //--------------------------------------------------------------------------
+
+  struct Init {
+    typedef typename self_type::execution_space execution_space ;
+
+    elem_coord_type coords ;
+
+    Init( const elem_coord_type & arg_coords )
+      : coords( arg_coords ) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void operator()( size_type ielem ) const
+    {
+      coords(ielem,0,0) = 0.;
+      coords(ielem,1,0) = 0.;
+      coords(ielem,2,0) = 0.;
+
+      coords(ielem,0,1) = 1.;
+      coords(ielem,1,1) = 0.;
+      coords(ielem,2,1) = 0.;
+
+      coords(ielem,0,2) = 1.;
+      coords(ielem,1,2) = 1.;
+      coords(ielem,2,2) = 0.;
+
+      coords(ielem,0,3) = 0.;
+      coords(ielem,1,3) = 1.;
+      coords(ielem,2,3) = 0.;
+
+
+      coords(ielem,0,4) = 0.;
+      coords(ielem,1,4) = 0.;
+      coords(ielem,2,4) = 1.;
+
+      coords(ielem,0,5) = 1.;
+      coords(ielem,1,5) = 0.;
+      coords(ielem,2,5) = 1.;
+
+      coords(ielem,0,6) = 1.;
+      coords(ielem,1,6) = 1.;
+      coords(ielem,2,6) = 1.;
+
+      coords(ielem,0,7) = 0.;
+      coords(ielem,1,7) = 1.;
+      coords(ielem,2,7) = 1.;
+    }
+  };
+
+  //--------------------------------------------------------------------------
+
+  static double test( const int count , const int iter = 1 )
+  {
+    elem_coord_type coord( "coord" , count );
+    elem_grad_type  grad ( "grad" , count );
+
+    // Execute the parallel kernels on the arrays:
+
+    double dt_min = 0 ;
+
+    Kokkos::parallel_for( count , Init( coord ) );
+    execution_space::fence();
+
+    for ( int i = 0 ; i < iter ; ++i ) {
+      Kokkos::Impl::Timer timer ;
+      Kokkos::parallel_for( count , HexGrad<execution_space>( coord , grad ) );
+      execution_space::fence();
+      const double dt = timer.seconds();
+      if ( 0 == i ) dt_min = dt ;
+      else dt_min = dt < dt_min ? dt : dt_min ;
+    }
+
+    return dt_min ;
+  }
+};
+
+}
+
diff --git a/lib/kokkos/core/perf_test/PerfTestHost.cpp b/lib/kokkos/core/perf_test/PerfTestHost.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..6a0f2efadacd01e979d3beefd23b617b81acff48
--- /dev/null
+++ b/lib/kokkos/core/perf_test/PerfTestHost.cpp
@@ -0,0 +1,104 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if defined( KOKKOS_HAVE_OPENMP )
+
+typedef Kokkos::OpenMP TestHostDevice ;
+const char TestHostDeviceName[] = "Kokkos::OpenMP" ;
+
+#elif defined( KOKKOS_HAVE_PTHREAD )
+
+typedef Kokkos::Threads TestHostDevice ;
+const char TestHostDeviceName[] = "Kokkos::Threads" ;
+
+#elif defined( KOKKOS_HAVE_SERIAL )
+
+typedef Kokkos::Serial TestHostDevice ;
+const char TestHostDeviceName[] = "Kokkos::Serial" ;
+
+#else
+#  error "You must enable at least one of the following execution spaces in order to build this test: Kokkos::Threads, Kokkos::OpenMP, or Kokkos::Serial."
+#endif
+
+#include <impl/Kokkos_Timer.hpp>
+
+#include <PerfTestHexGrad.hpp>
+#include <PerfTestBlasKernels.hpp>
+#include <PerfTestGramSchmidt.hpp>
+#include <PerfTestDriver.hpp>
+
+//------------------------------------------------------------------------
+
+namespace Test {
+
+class host : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    const unsigned team_count = Kokkos::hwloc::get_available_numa_count();
+    const unsigned threads_per_team = 4 ;
+
+    TestHostDevice::initialize( team_count * threads_per_team );
+  }
+
+  static void TearDownTestCase()
+  {
+    TestHostDevice::finalize();
+  }
+};
+
+TEST_F( host, hexgrad ) {
+  EXPECT_NO_THROW(run_test_hexgrad< TestHostDevice>( 10, 20, TestHostDeviceName ));
+}
+
+TEST_F( host, gramschmidt ) {
+  EXPECT_NO_THROW(run_test_gramschmidt< TestHostDevice>( 10, 20, TestHostDeviceName ));
+}
+
+} // namespace Test
+
+
diff --git a/lib/kokkos/core/perf_test/PerfTestMain.cpp b/lib/kokkos/core/perf_test/PerfTestMain.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..ac916308292076fc27231968715518b3f5c02f80
--- /dev/null
+++ b/lib/kokkos/core/perf_test/PerfTestMain.cpp
@@ -0,0 +1,49 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+int main(int argc, char *argv[]) {
+  ::testing::InitGoogleTest(&argc,argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/lib/kokkos/core/perf_test/test_atomic.cpp b/lib/kokkos/core/perf_test/test_atomic.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..f1e5c1b6200474417bc822ed1f9b2a217de51bfd
--- /dev/null
+++ b/lib/kokkos/core/perf_test/test_atomic.cpp
@@ -0,0 +1,504 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <cstdio>
+#include <cstring>
+#include <cstdlib>
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Timer.hpp>
+
+typedef Kokkos::DefaultExecutionSpace exec_space;
+
+#define RESET		0
+#define BRIGHT 		1
+#define DIM		2
+#define UNDERLINE 	3
+#define BLINK		4
+#define REVERSE		7
+#define HIDDEN		8
+
+#define BLACK 		0
+#define RED		1
+#define GREEN		2
+#define YELLOW		3
+#define BLUE		4
+#define MAGENTA		5
+#define CYAN		6
+#define GREY		7
+#define	WHITE		8
+
+void textcolor(int attr, int fg, int bg)
+{	char command[13];
+
+	/* Command is the control command to the terminal */
+	sprintf(command, "%c[%d;%d;%dm", 0x1B, attr, fg + 30, bg + 40);
+	printf("%s", command);
+}
+void textcolor_standard() {textcolor(RESET, BLACK, WHITE);}
+
+
+template<class T,class DEVICE_TYPE>
+struct ZeroFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef typename Kokkos::View<T,execution_space> type;
+  typedef typename Kokkos::View<T,execution_space>::HostMirror h_type;
+  type data;
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+    data() = 0;
+  }
+};
+
+//---------------------------------------------------
+//--------------atomic_fetch_add---------------------
+//---------------------------------------------------
+
+template<class T,class DEVICE_TYPE>
+struct AddFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+    Kokkos::atomic_fetch_add(&data(),(T)1);
+  }
+};
+
+template<class T>
+T AddLoop(int loop) {
+  struct ZeroFunctor<T,exec_space> f_zero;
+  typename ZeroFunctor<T,exec_space>::type data("Data");
+  typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  exec_space::fence();
+
+  struct AddFunctor<T,exec_space> f_add;
+  f_add.data = data;
+  Kokkos::parallel_for(loop,f_add);
+  exec_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+  return val;
+}
+
+template<class T,class DEVICE_TYPE>
+struct AddNonAtomicFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+    data()+=(T)1;
+  }
+};
+
+template<class T>
+T AddLoopNonAtomic(int loop) {
+  struct ZeroFunctor<T,exec_space> f_zero;
+  typename ZeroFunctor<T,exec_space>::type data("Data");
+  typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
+
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  exec_space::fence();
+
+  struct AddNonAtomicFunctor<T,exec_space> f_add;
+  f_add.data = data;
+  Kokkos::parallel_for(loop,f_add);
+  exec_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+
+  return val;
+}
+
+template<class T>
+T AddLoopSerial(int loop) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  for(int i=0;i<loop;i++)
+  *data+=(T)1;
+
+  T val = *data;
+  delete data;
+  return val;
+}
+
+template<class T,class DEVICE_TYPE>
+struct CASFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+	  T old = data();
+	  T newval, assumed;
+	  do {
+	    assumed = old;
+	    newval = assumed + (T)1;
+	    old = Kokkos::atomic_compare_exchange(&data(), assumed, newval);
+	  }
+	  while( old != assumed );
+  }
+};
+
+template<class T>
+T CASLoop(int loop) {
+  struct ZeroFunctor<T,exec_space> f_zero;
+  typename ZeroFunctor<T,exec_space>::type data("Data");
+  typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  exec_space::fence();
+
+  struct CASFunctor<T,exec_space> f_cas;
+  f_cas.data = data;
+  Kokkos::parallel_for(loop,f_cas);
+  exec_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+
+  return val;
+}
+
+template<class T,class DEVICE_TYPE>
+struct CASNonAtomicFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+	  volatile T assumed;
+	  volatile T newval;
+	  bool fail=1;
+	  do {
+	    assumed = data();
+	    newval = assumed + (T)1;
+	    if(data()==assumed) {
+	    	data() = newval;
+	    	fail = 0;
+	    }
+	  }
+	  while(fail);
+  }
+};
+
+template<class T>
+T CASLoopNonAtomic(int loop) {
+  struct ZeroFunctor<T,exec_space> f_zero;
+  typename ZeroFunctor<T,exec_space>::type data("Data");
+  typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  exec_space::fence();
+
+  struct CASNonAtomicFunctor<T,exec_space> f_cas;
+  f_cas.data = data;
+  Kokkos::parallel_for(loop,f_cas);
+  exec_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+
+  return val;
+}
+
+template<class T>
+T CASLoopSerial(int loop) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  for(int i=0;i<loop;i++) {
+	  T assumed;
+	  T newval;
+	  T old;
+	  do {
+	    assumed = *data;
+	    newval = assumed + (T)1;
+	    old = *data;
+	    *data = newval;
+	  }
+	  while(!(assumed==old));
+  }
+
+  T val = *data;
+  delete data;
+  return val;
+}
+
+template<class T,class DEVICE_TYPE>
+struct ExchFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data, data2;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+	T old = Kokkos::atomic_exchange(&data(),(T)i);
+    Kokkos::atomic_fetch_add(&data2(),old);
+  }
+};
+
+template<class T>
+T ExchLoop(int loop) {
+  struct ZeroFunctor<T,exec_space> f_zero;
+  typename ZeroFunctor<T,exec_space>::type data("Data");
+  typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  exec_space::fence();
+
+  typename ZeroFunctor<T,exec_space>::type data2("Data");
+  typename ZeroFunctor<T,exec_space>::h_type h_data2("HData");
+  f_zero.data = data2;
+  Kokkos::parallel_for(1,f_zero);
+  exec_space::fence();
+
+  struct ExchFunctor<T,exec_space> f_exch;
+  f_exch.data = data;
+  f_exch.data2 = data2;
+  Kokkos::parallel_for(loop,f_exch);
+  exec_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy(h_data2,data2);
+  T val = h_data() + h_data2();
+
+  return val;
+}
+
+template<class T,class DEVICE_TYPE>
+struct ExchNonAtomicFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data, data2;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+		T old = data();
+		data()=(T) i;
+		data2()+=old;
+  }
+};
+
+
+template<class T>
+T ExchLoopNonAtomic(int loop) {
+  struct ZeroFunctor<T,exec_space> f_zero;
+  typename ZeroFunctor<T,exec_space>::type data("Data");
+  typename ZeroFunctor<T,exec_space>::h_type h_data("HData");
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  exec_space::fence();
+
+  typename ZeroFunctor<T,exec_space>::type data2("Data");
+  typename ZeroFunctor<T,exec_space>::h_type h_data2("HData");
+  f_zero.data = data2;
+  Kokkos::parallel_for(1,f_zero);
+  exec_space::fence();
+
+  struct ExchNonAtomicFunctor<T,exec_space> f_exch;
+  f_exch.data = data;
+  f_exch.data2 = data2;
+  Kokkos::parallel_for(loop,f_exch);
+  exec_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy(h_data2,data2);
+  T val = h_data() + h_data2();
+
+  return val;
+}
+
+template<class T>
+T ExchLoopSerial(int loop) {
+  T* data = new T[1];
+  T* data2 = new T[1];
+  data[0] = 0;
+  data2[0] = 0;
+  for(int i=0;i<loop;i++) {
+	T old = *data;
+	*data=(T) i;
+	*data2+=old;
+  }
+
+  T val = *data2 + *data;
+  delete data;
+  delete data2;
+  return val;
+}
+
+template<class T>
+T LoopVariant(int loop, int test) {
+  switch (test) {
+    case 1: return AddLoop<T>(loop);
+    case 2: return CASLoop<T>(loop);
+    case 3: return ExchLoop<T>(loop);
+  }
+  return 0;
+}
+
+template<class T>
+T LoopVariantSerial(int loop, int test) {
+  switch (test) {
+    case 1: return AddLoopSerial<T>(loop);
+    case 2: return CASLoopSerial<T>(loop);
+    case 3: return ExchLoopSerial<T>(loop);
+  }
+  return 0;
+}
+
+template<class T>
+T LoopVariantNonAtomic(int loop, int test) {
+  switch (test) {
+    case 1: return AddLoopNonAtomic<T>(loop);
+    case 2: return CASLoopNonAtomic<T>(loop);
+    case 3: return ExchLoopNonAtomic<T>(loop);
+  }
+  return 0;
+}
+
+template<class T>
+void Loop(int loop, int test, const char* type_name) {
+  LoopVariant<T>(loop,test);
+
+  Kokkos::Impl::Timer timer;
+  T res = LoopVariant<T>(loop,test);
+  double time1 = timer.seconds();
+
+  timer.reset();
+  T resNonAtomic = LoopVariantNonAtomic<T>(loop,test);
+  double time2 = timer.seconds();
+
+  timer.reset();
+  T resSerial = LoopVariantSerial<T>(loop,test);
+  double time3 = timer.seconds();
+
+  time1*=1e6/loop;
+  time2*=1e6/loop;
+  time3*=1e6/loop;
+  //textcolor_standard();
+  bool passed = true;
+  if(resSerial!=res) passed = false;
+  //if(!passed) textcolor(RESET,BLACK,YELLOW);
+  printf("%s Test %i %s  --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",type_name,test,passed?"PASSED":"FAILED",loop,1.0*resSerial,1.0*res,1.0*resNonAtomic,time1,time2,time3,(int)sizeof(T));
+  //if(!passed) textcolor_standard();
+  printf("\n");
+}
+
+
+template<class T>
+void Test(int loop, int test, const char* type_name) {
+  if(test==-1) {
+    Loop<T>(loop,1,type_name);
+    Loop<T>(loop,2,type_name);
+    Loop<T>(loop,3,type_name);
+
+  }
+  else
+    Loop<T>(loop,test,type_name);
+}
+
+int main(int argc, char* argv[])
+{
+  int type = -1;
+  int loop = 1000000;
+  int test = -1;
+
+  for(int i=0;i<argc;i++)
+  {
+     if((strcmp(argv[i],"--test")==0)) {test=atoi(argv[++i]); continue;}
+     if((strcmp(argv[i],"--type")==0)) {type=atoi(argv[++i]); continue;}
+     if((strcmp(argv[i],"-l")==0)||(strcmp(argv[i],"--loop")==0)) {loop=atoi(argv[++i]); continue;}
+  }
+
+
+  Kokkos::initialize(argc,argv);
+
+
+  printf("Using %s\n",Kokkos::atomic_query_version());
+  bool all_tests = false;
+  if(type==-1) all_tests = true;
+  while(type<100) {
+    if(type==1) {
+     Test<int>(loop,test,"int                    ");
+    }
+    if(type==2) {
+     Test<long int>(loop,test,"long int               ");
+    }
+    if(type==3) {
+     Test<long long int>(loop,test,"long long int          ");
+    }
+    if(type==4) {
+     Test<unsigned int>(loop,test,"unsigned int           ");
+    }
+    if(type==5) {
+     Test<unsigned long int>(loop,test,"unsigned long int      ");
+    }
+    if(type==6) {
+     Test<unsigned long long int>(loop,test,"unsigned long long int ");
+    }
+    if(type==10) {
+     //Test<float>(loop,test,"float                  ");
+    }
+    if(type==11) {
+     Test<double>(loop,test,"double                 ");
+    }
+    if(!all_tests) type=100;
+    else type++;
+  }
+
+  Kokkos::finalize();
+
+}
+
diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_View.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..37c5e53e58e901a3519a5c60bdaf5aec001c80e6
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_View.hpp
@@ -0,0 +1,283 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
+#define KOKKOS_EXPERIMENTAL_CUDA_VIEW_HPP
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#if defined( KOKKOS_HAVE_CUDA )
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4)
+// Via reinterpret_case this can be used to support all scalar types of those sizes.
+// Any other scalar type falls back to either normal reads out of global memory,
+// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0)
+
+template< typename ValueType , typename AliasType >
+struct CudaTextureFetch {
+
+  ::cudaTextureObject_t   m_obj ;
+  const ValueType       * m_ptr ;
+  int                     m_offset ;
+
+  // Deference operator pulls through texture object and returns by value
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator[]( const iType & i ) const
+    {
+#if defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
+      AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
+      return  *(reinterpret_cast<ValueType*> (&v));
+#else
+      return m_ptr[ i ];
+#endif
+    }
+
+  // Pointer to referenced memory
+  KOKKOS_INLINE_FUNCTION
+  operator const ValueType * () const { return m_ptr ; }
+
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch() : m_obj() , m_ptr() , m_offset() {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~CudaTextureFetch() {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch( const CudaTextureFetch & rhs )
+    : m_obj(     rhs.m_obj )
+    , m_ptr(     rhs.m_ptr )
+    , m_offset(  rhs.m_offset )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch( CudaTextureFetch && rhs )
+    : m_obj(     rhs.m_obj )
+    , m_ptr(     rhs.m_ptr )
+    , m_offset(  rhs.m_offset )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
+    {
+      m_obj     = rhs.m_obj ;
+      m_ptr     = rhs.m_ptr ;
+      m_offset  = rhs.m_offset ;
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch & operator = ( CudaTextureFetch && rhs )
+    {
+      m_obj     = rhs.m_obj ;
+      m_ptr     = rhs.m_ptr ;
+      m_offset  = rhs.m_offset ;
+      return *this ;
+    }
+
+  // Texture object spans the entire allocation.
+  // This handle may view a subset of the allocation, so an offset is required.
+  template< class CudaMemorySpace >
+  inline explicit
+  CudaTextureFetch( const ValueType * const arg_ptr
+                  , Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > & record
+                  )
+    // 'attach_texture_object' returns 0 when __CUDA_ARCH__ < 300
+    : m_obj( record.template attach_texture_object< AliasType >() )
+    , m_ptr( arg_ptr )
+    , m_offset( record.attach_texture_object_offset( reinterpret_cast<const AliasType*>( arg_ptr ) ) )
+    {}
+};
+
+#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC )
+
+template< typename ValueType , typename AliasType >
+struct CudaLDGFetch {
+
+  const ValueType * m_ptr ;
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator[]( const iType & i ) const
+    {
+      AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_ptr[i]));
+      return  *(reinterpret_cast<ValueType*> (&v));
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  operator const ValueType * () const { return m_ptr ; }
+
+  KOKKOS_INLINE_FUNCTION
+  CudaLDGFetch() : m_ptr() {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~CudaLDGFetch() {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaLDGFetch( const CudaLDGFetch & rhs )
+    : m_ptr( rhs.m_ptr )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaLDGFetch( CudaLDGFetch && rhs )
+    : m_ptr( rhs.m_ptr )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaLDGFetch & operator = ( const CudaLDGFetch & rhs )
+    {
+      m_ptr = rhs.m_ptr ;
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  CudaLDGFetch & operator = ( CudaLDGFetch && rhs )
+    {
+      m_ptr = rhs.m_ptr ;
+      return *this ;
+    }
+
+  template< class CudaMemorySpace >
+  inline explicit
+  CudaTextureFetch( const ValueType * const arg_ptr
+                  , Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > const &
+                  )
+    : m_ptr( arg_data_ptr )
+    {}
+};
+
+#endif
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+/** \brief  Replace Default ViewDataHandle with Cuda texture fetch specialization
+ *          if 'const' value type, CudaSpace and random access.
+ */
+template< class Traits >
+class ViewDataHandle< Traits ,
+  typename std::enable_if<(
+    // Is Cuda memory space
+    ( std::is_same< typename Traits::memory_space,Kokkos::CudaSpace>::value ||
+      std::is_same< typename Traits::memory_space,Kokkos::CudaUVMSpace>::value )
+    &&
+    // Is a trivial const value of 4, 8, or 16 bytes
+    std::is_trivial<typename Traits::const_value_type>::value
+    &&
+    std::is_same<typename Traits::const_value_type,typename Traits::value_type>::value
+    &&
+    ( sizeof(typename Traits::const_value_type) ==  4 ||
+      sizeof(typename Traits::const_value_type) ==  8 ||
+      sizeof(typename Traits::const_value_type) == 16 )
+    &&
+    // Random access trait
+    ( Traits::memory_traits::RandomAccess != 0 )
+  )>::type >
+{
+public:
+
+  using track_type  = Kokkos::Experimental::Impl::SharedAllocationTracker ;
+
+  using value_type  = typename Traits::const_value_type ;
+  using return_type = typename Traits::const_value_type ; // NOT a reference
+
+  using alias_type = typename std::conditional< ( sizeof(value_type) ==  4 ) , int ,
+                     typename std::conditional< ( sizeof(value_type) ==  8 ) , ::int2 ,
+                     typename std::conditional< ( sizeof(value_type) == 16 ) , ::int4 , void
+                     >::type
+                     >::type
+                     >::type ;
+
+#if defined( KOKKOS_CUDA_USE_LDG_INTRINSIC )
+  using handle_type = Kokkos::Experimental::Impl::CudaLDGFetch< value_type , alias_type > ;
+#else
+  using handle_type = Kokkos::Experimental::Impl::CudaTextureFetch< value_type , alias_type > ;
+#endif
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type const & assign( handle_type const & arg_handle , track_type const & /* arg_tracker */ )
+    {
+      return arg_handle ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign( value_type * arg_data_ptr, track_type const & arg_tracker )
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      // Assignment of texture = non-texture requires creation of a texture object
+      // which can only occur on the host.  In addition, 'get_record' is only valid
+      // if called in a host execution space
+      return handle_type( arg_data_ptr , arg_tracker.template get_record< typename Traits::memory_space >() );
+#else
+      Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel");
+      return handle_type();
+#endif
+    }
+};
+
+}
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
+#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..c1b2d51c477e8f99dad975f4f33757f8af04393a
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp
@@ -0,0 +1,277 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDAEXEC_HPP
+#define KOKKOS_CUDAEXEC_HPP
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <string>
+#include <Kokkos_Parallel.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <Cuda/Kokkos_Cuda_abort.hpp>
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+struct CudaTraits {
+  enum { WarpSize       = 32      /* 0x0020 */ };
+  enum { WarpIndexMask  = 0x001f  /* Mask for warpindex */ };
+  enum { WarpIndexShift = 5       /* WarpSize == 1 << WarpShift */ };
+
+  enum { SharedMemoryBanks    = 32      /* Compute device 2.0 */ };
+  enum { SharedMemoryCapacity = 0x0C000 /* 48k shared / 16k L1 Cache */ };
+  enum { SharedMemoryUsage    = 0x04000 /* 16k shared / 48k L1 Cache */ };
+
+  enum { UpperBoundGridCount    = 65535 /* Hard upper bound */ };
+  enum { ConstantMemoryCapacity = 0x010000 /* 64k bytes */ };
+  enum { ConstantMemoryUsage    = 0x008000 /* 32k bytes */ };
+  enum { ConstantMemoryCache    = 0x002000 /*  8k bytes */ };
+
+  typedef unsigned long
+    ConstantGlobalBufferType[ ConstantMemoryUsage / sizeof(unsigned long) ];
+
+  enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ };
+
+  KOKKOS_INLINE_FUNCTION static
+  CudaSpace::size_type warp_count( CudaSpace::size_type i )
+    { return ( i + WarpIndexMask ) >> WarpIndexShift ; }
+
+  KOKKOS_INLINE_FUNCTION static
+  CudaSpace::size_type warp_align( CudaSpace::size_type i )
+    {
+      enum { Mask = ~CudaSpace::size_type( WarpIndexMask ) };
+      return ( i + WarpIndexMask ) & Mask ;
+    }
+};
+
+//----------------------------------------------------------------------------
+
+CudaSpace::size_type cuda_internal_maximum_warp_count();
+CudaSpace::size_type cuda_internal_maximum_grid_count();
+CudaSpace::size_type cuda_internal_maximum_shared_words();
+
+CudaSpace::size_type * cuda_internal_scratch_flags( const CudaSpace::size_type size );
+CudaSpace::size_type * cuda_internal_scratch_space( const CudaSpace::size_type size );
+CudaSpace::size_type * cuda_internal_scratch_unified( const CudaSpace::size_type size );
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( __CUDACC__ )
+
+/** \brief  Access to constant memory on the device */
+#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
+extern
+#endif
+__device__ __constant__
+Kokkos::Impl::CudaTraits::ConstantGlobalBufferType
+kokkos_impl_cuda_constant_memory_buffer ;
+
+__device__ __constant__
+int* kokkos_impl_cuda_atomic_lock_array ;
+#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
+#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39
+
+namespace Kokkos {
+namespace Impl {
+__device__ inline
+bool lock_address_cuda_space(void* ptr) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  //offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
+  return (0 == atomicCAS(&kokkos_impl_cuda_atomic_lock_array[offset],0,1));
+}
+
+__device__ inline
+void unlock_address_cuda_space(void* ptr) {
+  size_t offset = size_t(ptr);
+  offset = offset >> 2;
+  offset = offset & CUDA_SPACE_ATOMIC_MASK;
+  //offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
+  atomicExch( &kokkos_impl_cuda_atomic_lock_array[ offset ], 0);
+}
+
+}
+}
+
+template< typename T >
+inline
+__device__
+T * kokkos_impl_cuda_shared_memory()
+{ extern __shared__ Kokkos::CudaSpace::size_type sh[]; return (T*) sh ; }
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+// See section B.17 of Cuda C Programming Guide Version 3.2
+// for discussion of
+//   __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
+// function qualifier which could be used to improve performance.
+//----------------------------------------------------------------------------
+// Maximize L1 cache and minimize shared memory:
+//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferL1 );
+// For 2.0 capability: 48 KB L1 and 16 KB shared
+//----------------------------------------------------------------------------
+
+template< class DriverType >
+__global__
+static void cuda_parallel_launch_constant_memory()
+{
+  const DriverType & driver =
+    *((const DriverType *) kokkos_impl_cuda_constant_memory_buffer );
+
+  driver();
+}
+
+template< class DriverType >
+__global__
+static void cuda_parallel_launch_local_memory( const DriverType driver )
+{
+  driver();
+}
+
+template < class DriverType ,
+           bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) >
+struct CudaParallelLaunch ;
+
+template < class DriverType >
+struct CudaParallelLaunch< DriverType , true > {
+
+  inline
+  CudaParallelLaunch( const DriverType & driver
+                    , const dim3       & grid
+                    , const dim3       & block
+                    , const int          shmem
+                    , const cudaStream_t stream = 0 )
+  {
+    if ( grid.x && ( block.x * block.y * block.z ) ) {
+
+      if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) <
+           sizeof( DriverType ) ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") );
+      }
+
+      if ( CudaTraits::SharedMemoryCapacity < shmem ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
+      }
+      else if ( shmem ) {
+        cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferShared );
+      } else {
+        cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType > , cudaFuncCachePreferL1 );
+      }
+
+      // Copy functor to constant memory on the device
+      cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );
+
+      int* lock_array_ptr = lock_array_cuda_space_ptr();
+      cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
+
+      // Invoke the driver function on the device
+      cuda_parallel_launch_constant_memory< DriverType ><<< grid , block , shmem , stream >>>();
+
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+      Kokkos::Cuda::fence();
+      CUDA_SAFE_CALL( cudaGetLastError() );
+#endif
+    }
+  }
+};
+
+template < class DriverType >
+struct CudaParallelLaunch< DriverType , false > {
+
+  inline
+  CudaParallelLaunch( const DriverType & driver
+                    , const dim3       & grid
+                    , const dim3       & block
+                    , const int          shmem
+                    , const cudaStream_t stream = 0 )
+  {
+    if ( grid.x && ( block.x * block.y * block.z ) ) {
+
+      if ( CudaTraits::SharedMemoryCapacity < shmem ) {
+        Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") );
+      }
+      else if ( shmem ) {
+        cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferShared );
+      } else {
+        cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType > , cudaFuncCachePreferL1 );
+      }
+
+      int* lock_array_ptr = lock_array_cuda_space_ptr();
+      cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
+
+      cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );
+
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+      Kokkos::Cuda::fence();
+      CUDA_SAFE_CALL( cudaGetLastError() );
+#endif
+    }
+  }
+};
+
+//----------------------------------------------------------------------------
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* defined( __CUDACC__ ) */
+#endif /* defined( KOKKOS_HAVE_CUDA ) */
+#endif /* #ifndef KOKKOS_CUDAEXEC_HPP */
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..5b397845c351887cbcc80f9abf31ba2d2615dedc
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
@@ -0,0 +1,670 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdlib.h>
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <Kokkos_Cuda.hpp>
+#include <Kokkos_CudaSpace.hpp>
+
+#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
+#include <Cuda/Kokkos_Cuda_Internal.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+DeepCopy<CudaSpace,CudaSpace>::DeepCopy( void * dst , const void * src , size_t n )
+{ CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); }
+
+DeepCopy<CudaSpace,CudaSpace>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
+{ CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) ); }
+
+DeepCopy<HostSpace,CudaSpace>::DeepCopy( void * dst , const void * src , size_t n )
+{ CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); }
+
+DeepCopy<HostSpace,CudaSpace>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
+{ CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) ); }
+
+DeepCopy<CudaSpace,HostSpace>::DeepCopy( void * dst , const void * src , size_t n )
+{ CUDA_SAFE_CALL( cudaMemcpy( dst , src , n , cudaMemcpyDefault ) ); }
+
+DeepCopy<CudaSpace,HostSpace>::DeepCopy( const Cuda & instance , void * dst , const void * src , size_t n )
+{ CUDA_SAFE_CALL( cudaMemcpyAsync( dst , src , n , cudaMemcpyDefault , instance.cuda_stream() ) ); }
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+
+namespace Kokkos {
+
+namespace {
+
+void texture_object_attach_impl(  Impl::AllocationTracker const & tracker
+                                , unsigned type_size
+                                , ::cudaChannelFormatDesc const & desc
+                               )
+{
+  enum { TEXTURE_BOUND_1D = 2u << 27 };
+
+  if ( tracker.attribute() == NULL ) {
+    // check for correct allocator
+    const bool ok_alloc =  tracker.allocator()->support_texture_binding();
+
+    const bool ok_count = (tracker.alloc_size() / type_size) < TEXTURE_BOUND_1D;
+
+    if (ok_alloc && ok_count) {
+      Impl::TextureAttribute * attr = new Impl::TextureAttribute( tracker.alloc_ptr(), tracker.alloc_size(), desc );
+      tracker.set_attribute( attr );
+    }
+    else {
+      std::ostringstream oss;
+      oss << "Error: Cannot attach texture object";
+      if (!ok_alloc) {
+        oss << ", incompatabile allocator " << tracker.allocator()->name();
+      }
+      if (!ok_count) {
+        oss << ", array " << tracker.label() << " too large";
+      }
+      oss << ".";
+      Kokkos::Impl::throw_runtime_exception( oss.str() );
+    }
+  }
+
+  if ( NULL == dynamic_cast<Impl::TextureAttribute *>(tracker.attribute()) ) {
+    std::ostringstream oss;
+    oss << "Error: Allocation " << tracker.label() << " already has an attribute attached.";
+    Kokkos::Impl::throw_runtime_exception( oss.str() );
+  }
+
+}
+
+} // unnamed namespace
+
+/*--------------------------------------------------------------------------*/
+
+Impl::AllocationTracker CudaSpace::allocate_and_track( const std::string & label, const size_t size )
+{
+  return Impl::AllocationTracker( allocator(), size, label);
+}
+
+void CudaSpace::texture_object_attach(  Impl::AllocationTracker const & tracker
+                                      , unsigned type_size
+                                      , ::cudaChannelFormatDesc const & desc
+                                     )
+{
+  texture_object_attach_impl( tracker, type_size, desc );
+}
+
+void CudaSpace::access_error()
+{
+  const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" );
+  Kokkos::Impl::throw_runtime_exception( msg );
+}
+
+void CudaSpace::access_error( const void * const )
+{
+  const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" );
+  Kokkos::Impl::throw_runtime_exception( msg );
+}
+
+/*--------------------------------------------------------------------------*/
+
+Impl::AllocationTracker CudaUVMSpace::allocate_and_track( const std::string & label, const size_t size )
+{
+  return Impl::AllocationTracker( allocator(), size, label);
+}
+
+void CudaUVMSpace::texture_object_attach(  Impl::AllocationTracker const & tracker
+                                         , unsigned type_size
+                                         , ::cudaChannelFormatDesc const & desc
+                                        )
+{
+  texture_object_attach_impl( tracker, type_size, desc );
+}
+
+bool CudaUVMSpace::available()
+{
+#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && !defined(__APPLE__)
+  enum { UVM_available = true };
+#else
+  enum { UVM_available = false };
+#endif
+  return UVM_available;
+}
+
+/*--------------------------------------------------------------------------*/
+
+Impl::AllocationTracker CudaHostPinnedSpace::allocate_and_track( const std::string & label, const size_t size )
+{
+  return Impl::AllocationTracker( allocator(), size, label);
+}
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+CudaSpace::CudaSpace()
+  : m_device( Kokkos::Cuda().cuda_device() )
+{
+}
+
+CudaUVMSpace::CudaUVMSpace()
+  : m_device( Kokkos::Cuda().cuda_device() )
+{
+}
+
+CudaHostPinnedSpace::CudaHostPinnedSpace()
+{
+}
+
+void * CudaSpace::allocate( const size_t arg_alloc_size ) const
+{
+  void * ptr = NULL;
+
+  CUDA_SAFE_CALL( cudaMalloc( &ptr, arg_alloc_size ) );
+
+  return ptr ;
+}
+
+void * CudaUVMSpace::allocate( const size_t arg_alloc_size ) const
+{
+  void * ptr = NULL;
+
+  CUDA_SAFE_CALL( cudaMallocManaged( &ptr, arg_alloc_size , cudaMemAttachGlobal ) );
+
+  return ptr ;
+}
+
+void * CudaHostPinnedSpace::allocate( const size_t arg_alloc_size ) const
+{
+  void * ptr = NULL;
+
+  CUDA_SAFE_CALL( cudaHostAlloc( &ptr, arg_alloc_size , cudaHostAllocDefault ) );
+
+  return ptr ;
+}
+
+void CudaSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
+{
+  try {
+    CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
+  } catch(...) {}
+}
+
+void CudaUVMSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
+{
+  try {
+    CUDA_SAFE_CALL( cudaFree( arg_alloc_ptr ) );
+  } catch(...) {}
+}
+
+void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr , const size_t /* arg_alloc_size */ ) const
+{
+  try {
+    CUDA_SAFE_CALL( cudaFreeHost( arg_alloc_ptr ) );
+  } catch(...) {}
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+SharedAllocationRecord< void , void >
+SharedAllocationRecord< Kokkos::CudaSpace , void >::s_root_record ;
+
+SharedAllocationRecord< void , void >
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::s_root_record ;
+
+SharedAllocationRecord< void , void >
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::s_root_record ;
+
+::cudaTextureObject_t
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+attach_texture_object( const unsigned sizeof_alias
+                     , void *   const alloc_ptr
+                     , size_t   const alloc_size )
+{
+  // Only valid for 300 <= __CUDA_ARCH__
+  // otherwise return zero.
+
+  ::cudaTextureObject_t tex_obj ;
+
+  struct cudaResourceDesc resDesc ;
+  struct cudaTextureDesc  texDesc ;
+
+  memset( & resDesc , 0 , sizeof(resDesc) );
+  memset( & texDesc , 0 , sizeof(texDesc) );
+
+  resDesc.resType                = cudaResourceTypeLinear ;
+  resDesc.res.linear.desc        = ( sizeof_alias ==  4 ?  cudaCreateChannelDesc< int >() :
+                                   ( sizeof_alias ==  8 ?  cudaCreateChannelDesc< ::int2 >() :
+                                  /* sizeof_alias == 16 */ cudaCreateChannelDesc< ::int4 >() ) );
+  resDesc.res.linear.sizeInBytes = alloc_size ;
+  resDesc.res.linear.devPtr      = alloc_ptr ;
+
+  CUDA_SAFE_CALL( cudaCreateTextureObject( & tex_obj , & resDesc, & texDesc, NULL ) );
+
+  return tex_obj ;
+}
+
+std::string
+SharedAllocationRecord< Kokkos::CudaSpace , void >::get_label() const
+{
+  SharedAllocationHeader header ;
+
+  Kokkos::Impl::DeepCopy< Kokkos::HostSpace , Kokkos::CudaSpace >( & header , RecordBase::head() , sizeof(SharedAllocationHeader) );
+
+  return std::string( header.m_label );
+}
+
+std::string
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_label() const
+{
+  return std::string( RecordBase::head()->m_label );
+}
+
+std::string
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_label() const
+{
+  return std::string( RecordBase::head()->m_label );
+}
+
+SharedAllocationRecord< Kokkos::CudaSpace , void > *
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+allocate( const Kokkos::CudaSpace &  arg_space
+        , const std::string       &  arg_label
+        , const size_t               arg_alloc_size
+        )
+{
+  return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+}
+
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void > *
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+allocate( const Kokkos::CudaUVMSpace &  arg_space
+        , const std::string          &  arg_label
+        , const size_t                  arg_alloc_size
+        )
+{
+  return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+}
+
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > *
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+allocate( const Kokkos::CudaHostPinnedSpace &  arg_space
+        , const std::string                 &  arg_label
+        , const size_t                         arg_alloc_size
+        )
+{
+  return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+}
+
+void
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+deallocate( SharedAllocationRecord< void , void > * arg_rec )
+{
+  delete static_cast<SharedAllocationRecord*>(arg_rec);
+}
+
+void
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+deallocate( SharedAllocationRecord< void , void > * arg_rec )
+{
+  delete static_cast<SharedAllocationRecord*>(arg_rec);
+}
+
+void
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+deallocate( SharedAllocationRecord< void , void > * arg_rec )
+{
+  delete static_cast<SharedAllocationRecord*>(arg_rec);
+}
+
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+~SharedAllocationRecord()
+{
+  m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
+                    , SharedAllocationRecord< void , void >::m_alloc_size
+                    );
+}
+
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+~SharedAllocationRecord()
+{
+  m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
+                    , SharedAllocationRecord< void , void >::m_alloc_size
+                    );
+}
+
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+~SharedAllocationRecord()
+{
+  m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
+                    , SharedAllocationRecord< void , void >::m_alloc_size
+                    );
+}
+
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+SharedAllocationRecord( const Kokkos::CudaSpace & arg_space
+                      , const std::string       & arg_label
+                      , const size_t              arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Pass through allocated [ SharedAllocationHeader , user_memory ]
+  // Pass through deallocation function
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::CudaSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_tex_obj( 0 )
+  , m_space( arg_space )
+{
+  SharedAllocationHeader header ;
+
+  // Fill in the Header information
+  header.m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
+
+  strncpy( header.m_label
+          , arg_label.c_str()
+          , SharedAllocationHeader::maximum_label_length
+          );
+
+  // Copy to device memory
+  Kokkos::Impl::DeepCopy<CudaSpace,HostSpace>::DeepCopy( RecordBase::m_alloc_ptr , & header , sizeof(SharedAllocationHeader) );
+}
+
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space
+                      , const std::string          & arg_label
+                      , const size_t                 arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Pass through allocated [ SharedAllocationHeader , user_memory ]
+  // Pass through deallocation function
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_tex_obj( 0 )
+  , m_space( arg_space )
+{
+  // Fill in the Header information, directly accessible via UVM
+
+  RecordBase::m_alloc_ptr->m_record = this ;
+
+  strncpy( RecordBase::m_alloc_ptr->m_label
+          , arg_label.c_str()
+          , SharedAllocationHeader::maximum_label_length
+          );
+}
+
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
+                      , const std::string                 & arg_label
+                      , const size_t                        arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Pass through allocated [ SharedAllocationHeader , user_memory ]
+  // Pass through deallocation function
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_space( arg_space )
+{
+  // Fill in the Header information, directly accessible via UVM
+
+  RecordBase::m_alloc_ptr->m_record = this ;
+
+  strncpy( RecordBase::m_alloc_ptr->m_label
+          , arg_label.c_str()
+          , SharedAllocationHeader::maximum_label_length
+          );
+}
+
+SharedAllocationRecord< Kokkos::CudaSpace , void > *
+SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr )
+{
+  using Header     = SharedAllocationHeader ;
+  using RecordBase = SharedAllocationRecord< void , void > ;
+  using RecordCuda = SharedAllocationRecord< Kokkos::CudaSpace , void > ;
+
+#if 0
+  // Copy the header from the allocation
+  SharedAllocationHeader head ;
+
+  SharedAllocationHeader const * const head_cuda = Header::get_header( alloc_ptr );
+
+  Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , head_cuda , sizeof(SharedAllocationHeader) );
+
+  RecordCuda * const record = static_cast< RecordCuda * >( head.m_record );
+
+  if ( record->m_alloc_ptr != head_cuda ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
+  }
+
+#else
+
+  // Iterate the list to search for the record among all allocations
+  // requires obtaining the root of the list and then locking the list.
+
+  RecordCuda * const record = static_cast< RecordCuda * >( RecordBase::find( & s_root_record , alloc_ptr ) );
+
+  if ( record == 0 ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record ERROR" ) );
+  }
+
+#endif
+
+  return record ;
+}
+
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void > *
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record( void * alloc_ptr )
+{
+  using Header     = SharedAllocationHeader ;
+  using RecordCuda = SharedAllocationRecord< Kokkos::CudaUVMSpace , void > ;
+
+  Header * const h = reinterpret_cast< Header * >( alloc_ptr ) - 1 ;
+
+  if ( h->m_record->m_alloc_ptr != h ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::get_record ERROR" ) );
+  }
+
+  return static_cast< RecordCuda * >( h->m_record );
+}
+
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > *
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record( void * alloc_ptr )
+{
+  using Header     = SharedAllocationHeader ;
+  using RecordCuda = SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > ;
+
+  Header * const h = reinterpret_cast< Header * >( alloc_ptr ) - 1 ;
+
+  if ( h->m_record->m_alloc_ptr != h ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record ERROR" ) );
+  }
+
+  return static_cast< RecordCuda * >( h->m_record );
+}
+
+// Iterate records to print orphaned memory ...
+void
+SharedAllocationRecord< Kokkos::CudaSpace , void >::
+print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail )
+{
+  SharedAllocationRecord< void , void > * r = & s_root_record ;
+
+  char buffer[256] ;
+
+  SharedAllocationHeader head ;
+
+  if ( detail ) {
+    do {
+      if ( r->m_alloc_ptr ) {
+        Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
+      }
+      else {
+        head.m_label[0] = 0 ;
+      }
+
+      snprintf( buffer , 256 , "Cuda addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n"
+              , reinterpret_cast<unsigned long>( r )
+              , reinterpret_cast<unsigned long>( r->m_prev )
+              , reinterpret_cast<unsigned long>( r->m_next )
+              , reinterpret_cast<unsigned long>( r->m_alloc_ptr )
+              , r->m_alloc_size
+              , r->m_count
+              , reinterpret_cast<unsigned long>( r->m_dealloc )
+              , head.m_label
+              );
+      std::cout << buffer ;
+      r = r->m_next ;
+    } while ( r != & s_root_record );
+  }
+  else {
+    do {
+      if ( r->m_alloc_ptr ) {
+
+        Kokkos::Impl::DeepCopy<HostSpace,CudaSpace>::DeepCopy( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) );
+
+        snprintf( buffer , 256 , "Cuda [ 0x%.12lx + %ld ] %s\n"
+                , reinterpret_cast< unsigned long >( r->data() )
+                , r->size()
+                , head.m_label
+                );
+      }
+      else {
+        snprintf( buffer , 256 , "Cuda [ 0 + 0 ]\n" );
+      }
+      std::cout << buffer ;
+      r = r->m_next ;
+    } while ( r != & s_root_record );
+  }
+}
+
+void
+SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
+print_records( std::ostream & s , const Kokkos::CudaUVMSpace & space , bool detail )
+{
+  SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaUVM" , & s_root_record , detail );
+}
+
+void
+SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
+print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bool detail )
+{
+  SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail );
+}
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace {
+  __global__ void init_lock_array_kernel() {
+    unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
+
+    if(i<CUDA_SPACE_ATOMIC_MASK+1)
+      kokkos_impl_cuda_atomic_lock_array[i] = 0;
+  }
+}
+
+namespace Impl {
+int* lock_array_cuda_space_ptr(bool deallocate) {
+  static int* ptr = NULL;
+  if(deallocate) {
+    cudaFree(ptr);
+    ptr = NULL;
+  }
+
+  if(ptr==NULL && !deallocate)
+    cudaMalloc(&ptr,sizeof(int)*(CUDA_SPACE_ATOMIC_MASK+1));
+  return ptr;
+}
+
+void init_lock_array_cuda_space() {
+  int is_initialized = 0;
+  if(! is_initialized) {
+    int* lock_array_ptr = lock_array_cuda_space_ptr();
+    cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
+    init_lock_array_kernel<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
+  }
+}
+
+}
+}
+#endif // KOKKOS_HAVE_CUDA
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..e1314c0e511a96e82250d1ad39985f52547f5a51
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Alloc.hpp
@@ -0,0 +1,183 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_ALLOCATION_TRACKING_HPP
+#define KOKKOS_CUDA_ALLOCATION_TRACKING_HPP
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
+
+namespace Kokkos {
+namespace Impl {
+
+template< class DestructFunctor >
+SharedAllocationRecord *
+shared_allocation_record( Kokkos::CudaSpace const & arg_space
+                        , void *            const   arg_alloc_ptr
+                        , DestructFunctor   const & arg_destruct )
+{
+  SharedAllocationRecord * const record = SharedAllocationRecord::get_record( arg_alloc_ptr );
+
+  // assert: record != 0
+
+  // assert: sizeof(DestructFunctor) <= record->m_destruct_size
+
+  // assert: record->m_destruct_function == 0
+
+  DestructFunctor * const functor =
+    reinterpret_cast< DestructFunctor * >(
+    reinterpret_cast< unsigned long >( record ) + sizeof(SharedAllocationRecord) );
+
+  new( functor ) DestructFunctor( arg_destruct );
+
+  record->m_destruct_functor = & shared_allocation_destroy< DestructFunctor > ;
+  
+  return record ;
+}
+
+
+/// class CudaUnmanagedAllocator
+/// does nothing when deallocate(ptr,size) is called
+struct CudaUnmanagedAllocator
+{
+  static const char * name()
+  {
+    return "Cuda Unmanaged Allocator";
+  }
+
+  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
+
+  static bool support_texture_binding() { return true; }
+};
+
+/// class CudaUnmanagedAllocator
+/// does nothing when deallocate(ptr,size) is called
+struct CudaUnmanagedUVMAllocator
+{
+  static const char * name()
+  {
+    return "Cuda Unmanaged UVM Allocator";
+  }
+
+  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
+
+  static bool support_texture_binding() { return true; }
+};
+
+/// class CudaUnmanagedHostAllocator
+/// does nothing when deallocate(ptr,size) is called
+class CudaUnmanagedHostAllocator
+{
+public:
+  static const char * name()
+  {
+    return "Cuda Unmanaged Host Allocator";
+  }
+  // Unmanaged deallocate does nothing
+  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
+};
+
+/// class CudaMallocAllocator
+class CudaMallocAllocator
+{
+public:
+  static const char * name()
+  {
+    return "Cuda Malloc Allocator";
+  }
+
+  static void* allocate(size_t size);
+
+  static void deallocate(void * ptr, size_t);
+
+  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+
+  static bool support_texture_binding() { return true; }
+};
+
+/// class CudaUVMAllocator
+class CudaUVMAllocator
+{
+public:
+  static const char * name()
+  {
+    return "Cuda UVM Allocator";
+  }
+
+  static void* allocate(size_t size);
+
+  static void deallocate(void * ptr, size_t);
+
+  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+
+  static bool support_texture_binding() { return true; }
+};
+
+/// class CudaHostAllocator
+class CudaHostAllocator
+{
+public:
+  static const char * name()
+  {
+    return "Cuda Host Allocator";
+  }
+
+  static void* allocate(size_t size);
+
+  static void deallocate(void * ptr, size_t);
+
+  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+};
+
+
+}} // namespace Kokkos::Impl
+
+#endif //KOKKOS_HAVE_CUDA
+
+#endif // #ifndef KOKKOS_CUDA_ALLOCATION_TRACKING_HPP
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..8c8c5e47a5b13eebc7c09b8e69d5fb728b4988c4
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
@@ -0,0 +1,192 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <impl/Kokkos_Error.hpp>
+#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+
+#include <sstream>
+
+namespace Kokkos { namespace Impl {
+
+
+/*--------------------------------------------------------------------------*/
+TextureAttribute::TextureAttribute(  void * const alloc_ptr
+                                   , size_t alloc_size
+                                   , cudaChannelFormatDesc const & desc
+                                  )
+  : m_tex_obj(0)
+{
+  cuda_device_synchronize();
+
+  struct cudaResourceDesc resDesc ;
+  struct cudaTextureDesc  texDesc ;
+
+  memset( & resDesc , 0 , sizeof(resDesc) );
+  memset( & texDesc , 0 , sizeof(texDesc) );
+
+  resDesc.resType                = cudaResourceTypeLinear ;
+  resDesc.res.linear.desc        = desc ;
+  resDesc.res.linear.sizeInBytes = alloc_size ;
+  resDesc.res.linear.devPtr      = alloc_ptr ;
+
+  CUDA_SAFE_CALL( cudaCreateTextureObject( & m_tex_obj , & resDesc, & texDesc, NULL) );
+
+  cuda_device_synchronize();
+}
+
+
+TextureAttribute::~TextureAttribute()
+{
+  if (m_tex_obj) {
+    cudaDestroyTextureObject( m_tex_obj );
+  }
+}
+
+/*--------------------------------------------------------------------------*/
+
+void * CudaMallocAllocator::allocate( size_t size )
+{
+  void * ptr = NULL;
+
+  CUDA_SAFE_CALL( cudaMalloc( &ptr, size ) );
+
+  return ptr;
+}
+
+void CudaMallocAllocator::deallocate( void * ptr, size_t /*size*/ )
+{
+  try {
+    CUDA_SAFE_CALL( cudaFree( ptr ) );
+  } catch(...) {}
+}
+
+void * CudaMallocAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
+{
+  void * ptr = old_ptr;
+  if (old_size != new_size) {
+    ptr = allocate( new_size );
+    size_t copy_size = old_size < new_size ? old_size : new_size;
+
+    CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
+
+    deallocate( old_ptr, old_size );
+  }
+  return ptr;
+}
+
+/*--------------------------------------------------------------------------*/
+
+void * CudaUVMAllocator::allocate( size_t size )
+{
+#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION )
+  void * ptr = NULL;
+  CUDA_SAFE_CALL( cudaMallocManaged( &ptr, size, cudaMemAttachGlobal ) );
+  return ptr;
+#else
+  throw_runtime_exception( "CUDA VERSION does not support UVM" );
+  return NULL;
+#endif
+}
+
+void CudaUVMAllocator::deallocate( void * ptr, size_t /*size*/ )
+{
+  try {
+    CUDA_SAFE_CALL( cudaFree( ptr ) );
+  } catch(...) {}
+}
+
+void * CudaUVMAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
+{
+  void * ptr = old_ptr;
+  if (old_size != new_size) {
+    ptr = allocate( new_size );
+    size_t copy_size = old_size < new_size ? old_size : new_size;
+
+    CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
+
+    deallocate( old_ptr, old_size );
+  }
+  return ptr;
+}
+
+/*--------------------------------------------------------------------------*/
+
+void * CudaHostAllocator::allocate( size_t size )
+{
+  void * ptr = NULL;
+  CUDA_SAFE_CALL( cudaHostAlloc( &ptr , size , cudaHostAllocDefault ) );
+  return ptr;
+}
+
+void CudaHostAllocator::deallocate( void * ptr, size_t /*size*/ )
+{
+  try {
+    CUDA_SAFE_CALL( cudaFreeHost( ptr ) );
+  } catch(...) {}
+}
+
+void * CudaHostAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
+{
+  void * ptr = old_ptr;
+  if (old_size != new_size) {
+    ptr = allocate( new_size );
+    size_t copy_size = old_size < new_size ? old_size : new_size;
+
+    CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyHostToHost ) );
+
+    deallocate( old_ptr, old_size );
+  }
+  return ptr;
+}
+
+/*--------------------------------------------------------------------------*/
+
+}} // namespace Kokkos::Impl
+
+#endif //KOKKOS_HAVE_CUDA
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..86fe1c901bcbe62dd0f1e97e9b933a17da6283d7
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BasicAllocators.hpp
@@ -0,0 +1,187 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
+#define KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
+
+namespace Kokkos { namespace Impl {
+
+
+// Cuda 5.0 <texture_types.h> defines 'cudaTextureObject_t'
+// to be an 'unsigned long long'.  This chould change with
+// future version of Cuda and this typedef would have to
+// change accordingly.
+
+#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION )
+
+typedef enable_if<
+  sizeof(::cudaTextureObject_t) == sizeof(const void *) ,
+  ::cudaTextureObject_t >::type cuda_texture_object_type ;
+
+#else
+
+typedef const void * cuda_texture_object_type ;
+
+#endif
+
+
+struct TextureAttribute : public AllocatorAttributeBase
+{
+  cuda_texture_object_type m_tex_obj ;
+
+  TextureAttribute(  void * const alloc_ptr
+                   , size_t alloc_size
+                   , cudaChannelFormatDesc const & desc
+                  );
+
+  ~TextureAttribute();
+};
+
+
+/// class CudaUnmanagedAllocator
+/// does nothing when deallocate(ptr,size) is called
+struct CudaUnmanagedAllocator
+{
+  static const char * name()
+  {
+    return "Cuda Unmanaged Allocator";
+  }
+
+  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
+
+  static bool support_texture_binding() { return true; }
+};
+
+/// class CudaUnmanagedAllocator
+/// does nothing when deallocate(ptr,size) is called
+struct CudaUnmanagedUVMAllocator
+{
+  static const char * name()
+  {
+    return "Cuda Unmanaged UVM Allocator";
+  }
+
+  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
+
+  static bool support_texture_binding() { return true; }
+};
+
+/// class CudaUnmanagedHostAllocator
+/// does nothing when deallocate(ptr,size) is called
+class CudaUnmanagedHostAllocator
+{
+public:
+  static const char * name()
+  {
+    return "Cuda Unmanaged Host Allocator";
+  }
+  // Unmanaged deallocate does nothing
+  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
+};
+
+/// class CudaMallocAllocator
+class CudaMallocAllocator
+{
+public:
+  static const char * name()
+  {
+    return "Cuda Malloc Allocator";
+  }
+
+  static void* allocate(size_t size);
+
+  static void deallocate(void * ptr, size_t);
+
+  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+
+  static bool support_texture_binding() { return true; }
+};
+
+/// class CudaUVMAllocator
+class CudaUVMAllocator
+{
+public:
+  static const char * name()
+  {
+    return "Cuda UVM Allocator";
+  }
+
+  static void* allocate(size_t size);
+
+  static void deallocate(void * ptr, size_t);
+
+  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+
+  static bool support_texture_binding() { return true; }
+};
+
+/// class CudaHostAllocator
+class CudaHostAllocator
+{
+public:
+  static const char * name()
+  {
+    return "Cuda Host Allocator";
+  }
+
+  static void* allocate(size_t size);
+
+  static void deallocate(void * ptr, size_t);
+
+  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+};
+
+
+}} // namespace Kokkos::Impl
+
+#endif //KOKKOS_HAVE_CUDA
+
+#endif //KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..a0b29ddc2b270212f9c8b9d18e6ee394b9a61b39
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Error.hpp
@@ -0,0 +1,69 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_ERROR_HPP
+#define KOKKOS_CUDA_ERROR_HPP
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+namespace Kokkos { namespace Impl {
+
+void cuda_device_synchronize();
+
+void cuda_internal_error_throw( cudaError e , const char * name, const char * file = NULL, const int line = 0 );
+
+inline void cuda_internal_safe_call( cudaError e , const char * name, const char * file = NULL, const int line = 0)
+{
+  if ( cudaSuccess != e ) { cuda_internal_error_throw( e , name, file, line ); }
+}
+
+#define CUDA_SAFE_CALL( call )  \
+	Kokkos::Impl::cuda_internal_safe_call( call , #call, __FILE__, __LINE__ )
+
+}} // namespace Kokkos::Impl
+
+#endif //KOKKOS_HAVE_CUDA
+#endif //KOKKOS_CUDA_ERROR_HPP
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..b7c3a62d39934369e1ec1a5089f13abf1dfa94a5
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp
@@ -0,0 +1,678 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/*--------------------------------------------------------------------------*/
+/* Kokkos interfaces */
+
+#include <Kokkos_Core.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+#include <Cuda/Kokkos_Cuda_Internal.hpp>
+#include <impl/Kokkos_AllocationTracker.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+/*--------------------------------------------------------------------------*/
+/* Standard 'C' libraries */
+#include <stdlib.h>
+
+/* Standard 'C++' libraries */
+#include <vector>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
+__device__ __constant__
+Kokkos::Impl::CudaTraits::ConstantGlobalBufferType
+kokkos_impl_cuda_constant_memory_buffer ;
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+namespace {
+
+__global__
+void query_cuda_kernel_arch( int * d_arch )
+{
+#if defined( __CUDA_ARCH__ )
+  *d_arch = __CUDA_ARCH__ ;
+#else
+  *d_arch = 0 ;
+#endif
+}
+
+/** Query what compute capability is actually launched to the device: */
+int cuda_kernel_arch()
+{
+  int * d_arch = 0 ;
+  cudaMalloc( (void **) & d_arch , sizeof(int) );
+  query_cuda_kernel_arch<<<1,1>>>( d_arch );
+  int arch = 0 ;
+  cudaMemcpy( & arch , d_arch , sizeof(int) , cudaMemcpyDefault );
+  cudaFree( d_arch );
+  return arch ;
+}
+
+bool cuda_launch_blocking()
+{
+  const char * env = getenv("CUDA_LAUNCH_BLOCKING");
+
+  if (env == 0) return false;
+
+  return atoi(env);
+}
+
+}
+
+void cuda_device_synchronize()
+{
+//  static const bool launch_blocking = cuda_launch_blocking();
+
+//  if (!launch_blocking) {
+    CUDA_SAFE_CALL( cudaDeviceSynchronize() );
+//  }
+}
+
+void cuda_internal_error_throw( cudaError e , const char * name, const char * file, const int line )
+{
+  std::ostringstream out ;
+  out << name << " error( " << cudaGetErrorName(e) << "): " << cudaGetErrorString(e);
+  if (file) {
+    out << " " << file << ":" << line;
+  }
+  throw_runtime_exception( out.str() );
+}
+
+//----------------------------------------------------------------------------
+// Some significant cuda device properties:
+//
+// cudaDeviceProp::name                : Text label for device
+// cudaDeviceProp::major               : Device major number
+// cudaDeviceProp::minor               : Device minor number
+// cudaDeviceProp::warpSize            : number of threads per warp
+// cudaDeviceProp::multiProcessorCount : number of multiprocessors
+// cudaDeviceProp::sharedMemPerBlock   : capacity of shared memory per block
+// cudaDeviceProp::totalConstMem       : capacity of constant memory
+// cudaDeviceProp::totalGlobalMem      : capacity of global memory
+// cudaDeviceProp::maxGridSize[3]      : maximum grid size
+
+//
+//  Section 4.4.2.4 of the CUDA Toolkit Reference Manual
+//
+// struct cudaDeviceProp {
+//   char name[256];
+//   size_t totalGlobalMem;
+//   size_t sharedMemPerBlock;
+//   int regsPerBlock;
+//   int warpSize;
+//   size_t memPitch;
+//   int maxThreadsPerBlock;
+//   int maxThreadsDim[3];
+//   int maxGridSize[3];
+//   size_t totalConstMem;
+//   int major;
+//   int minor;
+//   int clockRate;
+//   size_t textureAlignment;
+//   int deviceOverlap;
+//   int multiProcessorCount;
+//   int kernelExecTimeoutEnabled;
+//   int integrated;
+//   int canMapHostMemory;
+//   int computeMode;
+//   int concurrentKernels;
+//   int ECCEnabled;
+//   int pciBusID;
+//   int pciDeviceID;
+//   int tccDriver;
+//   int asyncEngineCount;
+//   int unifiedAddressing;
+//   int memoryClockRate;
+//   int memoryBusWidth;
+//   int l2CacheSize;
+//   int maxThreadsPerMultiProcessor;
+// };
+
+
+namespace {
+
+
+
+class CudaInternalDevices {
+public:
+  enum { MAXIMUM_DEVICE_COUNT = 8 };
+  struct cudaDeviceProp  m_cudaProp[ MAXIMUM_DEVICE_COUNT ] ;
+  int                    m_cudaDevCount ;
+
+  CudaInternalDevices();
+
+  static const CudaInternalDevices & singleton();
+};
+
+CudaInternalDevices::CudaInternalDevices()
+{
+  // See 'cudaSetDeviceFlags' for host-device thread interaction
+  // Section 4.4.2.6 of the CUDA Toolkit Reference Manual
+
+  CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) );
+
+  for ( int i = 0 ; i < m_cudaDevCount ; ++i ) {
+    CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) );
+  }
+}
+
+const CudaInternalDevices & CudaInternalDevices::singleton()
+{
+  static CudaInternalDevices self ; return self ;
+}
+
+}
+
+//----------------------------------------------------------------------------
+
+class CudaInternal {
+private:
+
+  CudaInternal( const CudaInternal & );
+  CudaInternal & operator = ( const CudaInternal & );
+
+  AllocationTracker m_scratchFlagsTracker;
+  AllocationTracker m_scratchSpaceTracker;
+  AllocationTracker m_scratchUnifiedTracker;
+
+
+public:
+
+  typedef Cuda::size_type size_type ;
+
+  int         m_cudaDev ;
+  int         m_cudaArch ;
+  unsigned    m_maxWarpCount ;
+  unsigned    m_maxBlock ;
+  unsigned    m_maxSharedWords ;
+  size_type   m_scratchSpaceCount ;
+  size_type   m_scratchFlagsCount ;
+  size_type   m_scratchUnifiedCount ;
+  size_type   m_scratchUnifiedSupported ;
+  size_type   m_streamCount ;
+  size_type * m_scratchSpace ;
+  size_type * m_scratchFlags ;
+  size_type * m_scratchUnified ;
+  cudaStream_t * m_stream ;
+
+
+  static CudaInternal & singleton();
+
+  int verify_is_initialized( const char * const label ) const ;
+
+  int is_initialized() const
+    { return 0 != m_scratchSpace && 0 != m_scratchFlags ; }
+
+  void initialize( int cuda_device_id , int stream_count );
+  void finalize();
+
+  void print_configuration( std::ostream & ) const ;
+
+  ~CudaInternal();
+
+  CudaInternal()
+    : m_cudaDev( -1 )
+    , m_cudaArch( -1 )
+    , m_maxWarpCount( 0 )
+    , m_maxBlock( 0 )
+    , m_maxSharedWords( 0 )
+    , m_scratchSpaceCount( 0 )
+    , m_scratchFlagsCount( 0 )
+    , m_scratchUnifiedCount( 0 )
+    , m_scratchUnifiedSupported( 0 )
+    , m_streamCount( 0 )
+    , m_scratchSpace( 0 )
+    , m_scratchFlags( 0 )
+    , m_scratchUnified( 0 )
+    , m_stream( 0 )
+    {}
+
+  size_type * scratch_space( const size_type size );
+  size_type * scratch_flags( const size_type size );
+  size_type * scratch_unified( const size_type size );
+};
+
+//----------------------------------------------------------------------------
+
+
+void CudaInternal::print_configuration( std::ostream & s ) const
+{
+  const CudaInternalDevices & dev_info = CudaInternalDevices::singleton();
+
+#if defined( KOKKOS_HAVE_CUDA )
+    s << "macro  KOKKOS_HAVE_CUDA      : defined" << std::endl ;
+#endif
+#if defined( CUDA_VERSION )
+    s << "macro  CUDA_VERSION          = " << CUDA_VERSION
+      << " = version " << CUDA_VERSION / 1000
+      << "." << ( CUDA_VERSION % 1000 ) / 10
+      << std::endl ;
+#endif
+
+  for ( int i = 0 ; i < dev_info.m_cudaDevCount ; ++i ) {
+    s << "Kokkos::Cuda[ " << i << " ] "
+      << dev_info.m_cudaProp[i].name
+      << " capability " << dev_info.m_cudaProp[i].major << "." << dev_info.m_cudaProp[i].minor
+      << ", Total Global Memory: " << human_memory_size(dev_info.m_cudaProp[i].totalGlobalMem)
+      << ", Shared Memory per Block: " << human_memory_size(dev_info.m_cudaProp[i].sharedMemPerBlock);
+    if ( m_cudaDev == i ) s << " : Selected" ;
+    s << std::endl ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+CudaInternal::~CudaInternal()
+{
+  if ( m_stream ||
+       m_scratchSpace ||
+       m_scratchFlags ||
+       m_scratchUnified ) {
+    std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()"
+              << std::endl ;
+    std::cerr.flush();
+  }
+
+  m_cudaDev                 = -1 ;
+  m_cudaArch                = -1 ;
+  m_maxWarpCount            = 0 ;
+  m_maxBlock                = 0 ;
+  m_maxSharedWords          = 0 ;
+  m_scratchSpaceCount       = 0 ;
+  m_scratchFlagsCount       = 0 ;
+  m_scratchUnifiedCount     = 0 ;
+  m_scratchUnifiedSupported = 0 ;
+  m_streamCount             = 0 ;
+  m_scratchSpace            = 0 ;
+  m_scratchFlags            = 0 ;
+  m_scratchUnified          = 0 ;
+  m_stream                  = 0 ;
+}
+
+int CudaInternal::verify_is_initialized( const char * const label ) const
+{
+  if ( m_cudaDev < 0 ) {
+    std::cerr << "Kokkos::Cuda::" << label << " : ERROR device not initialized" << std::endl ;
+  }
+  return 0 <= m_cudaDev ;
+}
+
+CudaInternal & CudaInternal::singleton()
+{
+  static CudaInternal self ;
+  return self ;
+}
+
+void CudaInternal::initialize( int cuda_device_id , int stream_count )
+{
+  enum { WordSize = sizeof(size_type) };
+
+  if ( ! HostSpace::execution_space::is_initialized() ) {
+    const std::string msg("Cuda::initialize ERROR : HostSpace::execution_space is not initialized");
+    throw_runtime_exception( msg );
+  }
+
+  const CudaInternalDevices & dev_info = CudaInternalDevices::singleton();
+
+  const bool ok_init = 0 == m_scratchSpace || 0 == m_scratchFlags ;
+
+  const bool ok_id   = 0 <= cuda_device_id &&
+                            cuda_device_id < dev_info.m_cudaDevCount ;
+
+  // Need device capability 2.0 or better
+
+  const bool ok_dev = ok_id &&
+    ( 2 <= dev_info.m_cudaProp[ cuda_device_id ].major &&
+      0 <= dev_info.m_cudaProp[ cuda_device_id ].minor );
+
+  if ( ok_init && ok_dev ) {
+
+    const struct cudaDeviceProp & cudaProp =
+      dev_info.m_cudaProp[ cuda_device_id ];
+
+    m_cudaDev = cuda_device_id ;
+
+    CUDA_SAFE_CALL( cudaSetDevice( m_cudaDev ) );
+    CUDA_SAFE_CALL( cudaDeviceReset() );
+    Kokkos::Impl::cuda_device_synchronize();
+
+    // Query what compute capability architecture a kernel executes:
+    m_cudaArch = cuda_kernel_arch();
+
+    if ( m_cudaArch != cudaProp.major * 100 + cudaProp.minor * 10 ) {
+      std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability "
+                << ( m_cudaArch / 100 ) << "." << ( ( m_cudaArch % 100 ) / 10 )
+                << " on device with compute capability "
+                << cudaProp.major << "." << cudaProp.minor
+                << " , this will likely reduce potential performance."
+                << std::endl ;
+    }
+
+    //----------------------------------
+    // Maximum number of warps,
+    // at most one warp per thread in a warp for reduction.
+
+    // HCE 2012-February :
+    // Found bug in CUDA 4.1 that sometimes a kernel launch would fail
+    // if the thread count == 1024 and a functor is passed to the kernel.
+    // Copying the kernel to constant memory and then launching with
+    // thread count == 1024 would work fine.
+    //
+    // HCE 2012-October :
+    // All compute capabilities support at least 16 warps (512 threads).
+    // However, we have found that 8 warps typically gives better performance.
+
+    m_maxWarpCount = 8 ;
+
+    // m_maxWarpCount = cudaProp.maxThreadsPerBlock / Impl::CudaTraits::WarpSize ;
+
+    if ( Impl::CudaTraits::WarpSize < m_maxWarpCount ) {
+      m_maxWarpCount = Impl::CudaTraits::WarpSize ;
+    }
+
+    m_maxSharedWords = cudaProp.sharedMemPerBlock / WordSize ;
+
+    //----------------------------------
+    // Maximum number of blocks:
+
+    m_maxBlock = m_cudaArch < 300 ? 65535 : cudaProp.maxGridSize[0] ;
+
+    //----------------------------------
+
+    m_scratchUnifiedSupported = cudaProp.unifiedAddressing ;
+
+    if ( ! m_scratchUnifiedSupported ) {
+      std::cout << "Kokkos::Cuda device "
+                << cudaProp.name << " capability "
+                << cudaProp.major << "." << cudaProp.minor
+                << " does not support unified virtual address space"
+                << std::endl ;
+    }
+
+    //----------------------------------
+    // Multiblock reduction uses scratch flags for counters
+    // and scratch space for partial reduction values.
+    // Allocate some initial space.  This will grow as needed.
+
+    {
+      const unsigned reduce_block_count = m_maxWarpCount * Impl::CudaTraits::WarpSize ;
+
+      (void) scratch_unified( 16 * sizeof(size_type) );
+      (void) scratch_flags( reduce_block_count * 2  * sizeof(size_type) );
+      (void) scratch_space( reduce_block_count * 16 * sizeof(size_type) );
+    }
+    //----------------------------------
+
+    if ( stream_count ) {
+      m_stream = (cudaStream_t*) ::malloc( stream_count * sizeof(cudaStream_t) );
+      m_streamCount = stream_count ;
+      for ( size_type i = 0 ; i < m_streamCount ; ++i ) m_stream[i] = 0 ;
+    }
+  }
+  else {
+
+    std::ostringstream msg ;
+    msg << "Kokkos::Cuda::initialize(" << cuda_device_id << ") FAILED" ;
+
+    if ( ! ok_init ) {
+      msg << " : Already initialized" ;
+    }
+    if ( ! ok_id ) {
+      msg << " : Device identifier out of range "
+          << "[0.." << dev_info.m_cudaDevCount << "]" ;
+    }
+    else if ( ! ok_dev ) {
+      msg << " : Device " ;
+      msg << dev_info.m_cudaProp[ cuda_device_id ].major ;
+      msg << "." ;
+      msg << dev_info.m_cudaProp[ cuda_device_id ].minor ;
+      msg << " has insufficient capability, required 2.0 or better" ;
+    }
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  // Init the array for used for arbitrarily sized atomics
+  Impl::init_lock_array_cuda_space();
+
+}
+
+//----------------------------------------------------------------------------
+
+typedef Cuda::size_type ScratchGrain[ Impl::CudaTraits::WarpSize ] ;
+enum { sizeScratchGrain = sizeof(ScratchGrain) };
+
+
+Cuda::size_type *
+CudaInternal::scratch_flags( const Cuda::size_type size )
+{
+  if ( verify_is_initialized("scratch_flags") && m_scratchFlagsCount * sizeScratchGrain < size ) {
+
+
+    m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
+
+    m_scratchFlagsTracker = CudaSpace::allocate_and_track( std::string("InternalScratchFlags") , sizeof( ScratchGrain ) * m_scratchFlagsCount );
+    m_scratchFlags = reinterpret_cast<size_type *>(m_scratchFlagsTracker.alloc_ptr());
+
+    CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) );
+  }
+
+  return m_scratchFlags ;
+}
+
+Cuda::size_type *
+CudaInternal::scratch_space( const Cuda::size_type size )
+{
+  if ( verify_is_initialized("scratch_space") && m_scratchSpaceCount * sizeScratchGrain < size ) {
+
+    m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
+
+    m_scratchSpaceTracker = CudaSpace::allocate_and_track( std::string("InternalScratchSpace") , sizeof( ScratchGrain ) * m_scratchSpaceCount );
+    m_scratchSpace = reinterpret_cast<size_type *>(m_scratchSpaceTracker.alloc_ptr());
+
+  }
+
+  return m_scratchSpace ;
+}
+
+Cuda::size_type *
+CudaInternal::scratch_unified( const Cuda::size_type size )
+{
+  if ( verify_is_initialized("scratch_unified") &&
+       m_scratchUnifiedSupported && m_scratchUnifiedCount * sizeScratchGrain < size ) {
+
+    m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
+
+    m_scratchUnifiedTracker = CudaHostPinnedSpace::allocate_and_track( std::string("InternalScratchUnified") , sizeof( ScratchGrain ) * m_scratchUnifiedCount );
+    m_scratchUnified = reinterpret_cast<size_type *>( m_scratchUnifiedTracker.alloc_ptr() );
+  }
+
+  return m_scratchUnified ;
+}
+
+//----------------------------------------------------------------------------
+
+void CudaInternal::finalize()
+{
+  if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
+
+    lock_array_cuda_space_ptr(true);
+    if ( m_stream ) {
+      for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
+        cudaStreamDestroy( m_stream[i] );
+        m_stream[i] = 0 ;
+      }
+      ::free( m_stream );
+    }
+
+    m_scratchSpaceTracker.clear();
+    m_scratchFlagsTracker.clear();
+    m_scratchUnifiedTracker.clear();
+
+    m_cudaDev             = -1 ;
+    m_maxWarpCount        = 0 ;
+    m_maxBlock            = 0 ;
+    m_maxSharedWords      = 0 ;
+    m_scratchSpaceCount   = 0 ;
+    m_scratchFlagsCount   = 0 ;
+    m_scratchUnifiedCount = 0 ;
+    m_streamCount         = 0 ;
+    m_scratchSpace        = 0 ;
+    m_scratchFlags        = 0 ;
+    m_scratchUnified      = 0 ;
+    m_stream              = 0 ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+Cuda::size_type cuda_internal_maximum_warp_count()
+{ return CudaInternal::singleton().m_maxWarpCount ; }
+
+Cuda::size_type cuda_internal_maximum_grid_count()
+{ return CudaInternal::singleton().m_maxBlock ; }
+
+Cuda::size_type cuda_internal_maximum_shared_words()
+{ return CudaInternal::singleton().m_maxSharedWords ; }
+
+Cuda::size_type * cuda_internal_scratch_space( const Cuda::size_type size )
+{ return CudaInternal::singleton().scratch_space( size ); }
+
+Cuda::size_type * cuda_internal_scratch_flags( const Cuda::size_type size )
+{ return CudaInternal::singleton().scratch_flags( size ); }
+
+Cuda::size_type * cuda_internal_scratch_unified( const Cuda::size_type size )
+{ return CudaInternal::singleton().scratch_unified( size ); }
+
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+Cuda::size_type Cuda::detect_device_count()
+{ return Impl::CudaInternalDevices::singleton().m_cudaDevCount ; }
+
+int Cuda::is_initialized()
+{ return Impl::CudaInternal::singleton().is_initialized(); }
+
+void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
+{ Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances ); }
+
+std::vector<unsigned>
+Cuda::detect_device_arch()
+{
+  const Impl::CudaInternalDevices & s = Impl::CudaInternalDevices::singleton();
+
+  std::vector<unsigned> output( s.m_cudaDevCount );
+
+  for ( int i = 0 ; i < s.m_cudaDevCount ; ++i ) {
+    output[i] = s.m_cudaProp[i].major * 100 + s.m_cudaProp[i].minor ;
+  }
+
+  return output ;
+}
+
+Cuda::size_type Cuda::device_arch()
+{
+  const int dev_id = Impl::CudaInternal::singleton().m_cudaDev ;
+
+  int dev_arch = 0 ;
+
+  if ( 0 <= dev_id ) {
+    const struct cudaDeviceProp & cudaProp =
+      Impl::CudaInternalDevices::singleton().m_cudaProp[ dev_id ] ;
+
+    dev_arch = cudaProp.major * 100 + cudaProp.minor ;
+  }
+
+  return dev_arch ;
+}
+
+void Cuda::finalize()
+{ Impl::CudaInternal::singleton().finalize(); }
+
+Cuda::Cuda()
+  : m_device( Impl::CudaInternal::singleton().m_cudaDev )
+  , m_stream( 0 )
+{
+  Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" );
+}
+
+Cuda::Cuda( const int instance_id )
+  : m_device( Impl::CudaInternal::singleton().m_cudaDev )
+  , m_stream(
+      Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor" )
+        ? Impl::CudaInternal::singleton().m_stream[ instance_id % Impl::CudaInternal::singleton().m_streamCount ]
+        : 0 )
+{}
+
+void Cuda::print_configuration( std::ostream & s , const bool )
+{ Impl::CudaInternal::singleton().print_configuration( s ); }
+
+bool Cuda::sleep() { return false ; }
+
+bool Cuda::wake() { return true ; }
+
+void Cuda::fence()
+{
+  Kokkos::Impl::cuda_device_synchronize();
+}
+
+} // namespace Kokkos
+
+#endif // KOKKOS_HAVE_CUDA
+//----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..dd8a08729b25792f9a62be0e1afbfedbbfcebd08
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Internal.hpp
@@ -0,0 +1,165 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_INTERNAL_HPP
+#define KOKKOS_CUDA_INTERNAL_HPP
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <Cuda/Kokkos_Cuda_Error.hpp>
+
+namespace Kokkos { namespace Impl {
+
+
+template<class DriverType>
+int cuda_get_max_block_size(const typename DriverType::functor_type & f) {
+#if ( CUDA_VERSION < 6050 )
+  return 256;
+#else
+  bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );
+
+  int numBlocks;
+  if(Large) {
+    int blockSize=32;
+    int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize );
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &numBlocks,
+        cuda_parallel_launch_constant_memory<DriverType>,
+        blockSize,
+        sharedmem);
+
+    while (blockSize<1024 && numBlocks>0) {
+      blockSize*=2;
+      sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize );
+
+      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &numBlocks,
+          cuda_parallel_launch_constant_memory<DriverType>,
+          blockSize,
+          sharedmem);
+    }
+    if(numBlocks>0) return blockSize;
+    else return blockSize/2;
+  } else {
+    int blockSize=32;
+    int sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize );
+    cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &numBlocks,
+        cuda_parallel_launch_local_memory<DriverType>,
+        blockSize,
+        sharedmem);
+
+    while (blockSize<1024 && numBlocks>0) {
+      blockSize*=2;
+      sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize );
+
+      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+          &numBlocks,
+          cuda_parallel_launch_local_memory<DriverType>,
+          blockSize,
+          sharedmem);
+    }
+    if(numBlocks>0) return blockSize;
+    else return blockSize/2;
+  }
+#endif
+}
+
+template<class DriverType>
+int cuda_get_opt_block_size(const typename DriverType::functor_type & f) {
+#if ( CUDA_VERSION < 6050 )
+  return 256;
+#else
+  bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) );
+
+  int blockSize=16;
+  int numBlocks;
+  int sharedmem;
+  int maxOccupancy=0;
+  int bestBlockSize=0;
+
+  if(Large) {
+    while(blockSize<1024) {
+      blockSize*=2;
+
+      //calculate the occupancy with that optBlockSize and check whether its larger than the largest one found so far
+      sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize );
+      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+              &numBlocks,
+              cuda_parallel_launch_constant_memory<DriverType>,
+              blockSize,
+              sharedmem);
+      if(maxOccupancy < numBlocks*blockSize) {
+        maxOccupancy = numBlocks*blockSize;
+        bestBlockSize = blockSize;
+      }
+    }
+  } else {
+    while(blockSize<1024) {
+      blockSize*=2;
+      sharedmem = FunctorTeamShmemSize< typename DriverType::functor_type  >::value( f , blockSize );
+
+      cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+              &numBlocks,
+              cuda_parallel_launch_local_memory<DriverType>,
+              blockSize,
+              sharedmem);
+
+      if(maxOccupancy < numBlocks*blockSize) {
+        maxOccupancy = numBlocks*blockSize;
+        bestBlockSize = blockSize;
+      }
+    }
+  }
+  return bestBlockSize;
+#endif
+}
+
+}} // namespace Kokkos::Impl
+
+#endif // KOKKOS_HAVE_CUDA
+#endif /* #ifndef KOKKOS_CUDA_INTERNAL_HPP */
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..ce33c978c711051694eb052fcce29b07ae081335
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp
@@ -0,0 +1,1799 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_PARALLEL_HPP
+#define KOKKOS_CUDA_PARALLEL_HPP
+
+#include <iostream>
+#include <stdio.h>
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
+
+#include <utility>
+#include <Kokkos_Parallel.hpp>
+
+#include <Cuda/Kokkos_CudaExec.hpp>
+#include <Cuda/Kokkos_Cuda_ReduceScan.hpp>
+#include <Cuda/Kokkos_Cuda_Internal.hpp>
+#include <Kokkos_Vectorization.hpp>
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <typeinfo>
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< typename Type >
+struct CudaJoinFunctor {
+  typedef Type value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    volatile const value_type & input )
+    { update += input ; }
+};
+
+class CudaTeamMember {
+private:
+
+  typedef Kokkos::Cuda                           execution_space ;
+  typedef execution_space::scratch_memory_space  scratch_memory_space ;
+
+  void                * m_team_reduce ;
+  scratch_memory_space  m_team_shared ;
+  int                   m_league_rank ;
+  int                   m_league_size ;
+
+public:
+
+#if defined( __CUDA_ARCH__ )
+
+  __device__ inline
+  const execution_space::scratch_memory_space & team_shmem() const
+    { return m_team_shared ; }
+
+  __device__ inline int league_rank() const { return m_league_rank ; }
+  __device__ inline int league_size() const { return m_league_size ; }
+  __device__ inline int team_rank() const { return threadIdx.y ; }
+  __device__ inline int team_size() const { return blockDim.y ; }
+
+  __device__ inline void team_barrier() const { __syncthreads(); }
+
+  template<class ValueType>
+  __device__ inline void team_broadcast(ValueType& value, const int& thread_id) const {
+    __shared__ ValueType sh_val;
+    if(threadIdx.x == 0 && threadIdx.y == thread_id) {
+      sh_val = val;
+    }
+    team_barrier();
+    val = sh_val;
+  }
+
+#ifdef KOKKOS_HAVE_CXX11
+  template< class ValueType, class JoinOp >
+  __device__ inline
+  typename JoinOp::value_type team_reduce( const ValueType & value
+                                         , const JoinOp & op_in ) const
+    {
+      typedef JoinLambdaAdapter<ValueType,JoinOp> JoinOpFunctor ;
+      const JoinOpFunctor op(op_in);
+      ValueType * const base_data = (ValueType *) m_team_reduce ;
+#else
+  template< class JoinOp >
+  __device__ inline
+  typename JoinOp::value_type team_reduce( const typename JoinOp::value_type & value
+                                         , const JoinOp & op ) const
+    {
+      typedef JoinOp JoinOpFunctor ;
+      typename JoinOp::value_type * const base_data = (typename JoinOp::value_type *) m_team_reduce ;
+#endif
+
+      __syncthreads(); // Don't write in to shared data until all threads have entered this function
+
+      if ( 0 == threadIdx.y ) { base_data[0] = 0 ; }
+
+      base_data[ threadIdx.y ] = value ;
+
+      Impl::cuda_intra_block_reduce_scan<false,JoinOpFunctor,void>( op , base_data );
+
+      return base_data[ blockDim.y - 1 ];
+    }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   */
+  template< typename Type >
+  __device__ inline Type team_scan( const Type & value , Type * const global_accum ) const
+    {
+      Type * const base_data = (Type *) m_team_reduce ;
+
+      __syncthreads(); // Don't write in to shared data until all threads have entered this function
+
+      if ( 0 == threadIdx.y ) { base_data[0] = 0 ; }
+
+      base_data[ threadIdx.y + 1 ] = value ;
+
+      Impl::cuda_intra_block_reduce_scan<true,Impl::CudaJoinFunctor<Type>,void>( Impl::CudaJoinFunctor<Type>() , base_data + 1 );
+
+      if ( global_accum ) {
+        if ( blockDim.y == threadIdx.y + 1 ) {
+          base_data[ blockDim.y ] = atomic_fetch_add( global_accum , base_data[ blockDim.y ] );
+        }
+        __syncthreads(); // Wait for atomic
+        base_data[ threadIdx.y ] += base_data[ blockDim.y ] ;
+      }
+
+      return base_data[ threadIdx.y ];
+    }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template< typename Type >
+  __device__ inline Type team_scan( const Type & value ) const
+    { return this->template team_scan<Type>( value , 0 ); }
+
+  //----------------------------------------
+  // Private for the driver
+
+  __device__ inline
+  CudaTeamMember( void * shared
+                , const int shared_begin
+                , const int shared_size
+                , const int arg_league_rank
+                , const int arg_league_size )
+    : m_team_reduce( shared )
+    , m_team_shared( ((char *)shared) + shared_begin , shared_size )
+    , m_league_rank( arg_league_rank ) 
+    , m_league_size( arg_league_size ) 
+    {}
+
+#else
+
+  const execution_space::scratch_memory_space & team_shmem() const {return m_team_shared;}
+
+  int league_rank() const {return 0;}
+  int league_size() const {return 1;}
+  int team_rank() const {return 0;}
+  int team_size() const {return 1;}
+
+  void team_barrier() const {}
+  template<class ValueType>
+  void team_broadcast(ValueType& value, const int& thread_id) const {}
+
+  template< class JoinOp >
+  typename JoinOp::value_type team_reduce( const typename JoinOp::value_type & value
+                                         , const JoinOp & op ) const {return typename JoinOp::value_type();}
+
+  template< typename Type >
+  Type team_scan( const Type & value , Type * const global_accum ) const {return Type();}
+
+  template< typename Type >
+  Type team_scan( const Type & value ) const {return Type();}
+
+  //----------------------------------------
+  // Private for the driver
+
+  CudaTeamMember( void * shared
+                , const int shared_begin
+                , const int shared_end
+                , const int arg_league_rank
+                , const int arg_league_size );
+
+#endif /* #if ! defined( __CUDA_ARCH__ ) */
+
+};
+
+} // namespace Impl
+
+template< class Arg0 , class Arg1 >
+class TeamPolicy< Arg0 , Arg1 , Kokkos::Cuda >
+{
+private:
+
+  enum { MAX_WARP = 8 };
+
+  const int m_league_size ;
+  const int m_team_size ;
+  const int m_vector_length ;
+
+public:
+
+  //! Tag this class as a kokkos execution policy
+  typedef TeamPolicy     execution_policy ;
+
+  //! Execution space of this execution policy
+  typedef Kokkos::Cuda  execution_space ;
+
+  typedef typename
+    Impl::if_c< ! Impl::is_same< Kokkos::Cuda , Arg0 >::value , Arg0 , Arg1 >::type
+      work_tag ;
+
+  //----------------------------------------
+
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & functor )
+    {
+      int n = MAX_WARP * Impl::CudaTraits::WarpSize ;
+
+      for ( ; n ; n >>= 1 ) {
+        const int shmem_size =
+          /* for global reduce */ Impl::cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,work_tag>( functor , n )
+          /* for team   reduce */ + ( n + 2 ) * sizeof(double)
+          /* for team   shared */ + Impl::FunctorTeamShmemSize< FunctorType >::value( functor , n );
+
+        if ( shmem_size < Impl::CudaTraits::SharedMemoryCapacity ) break ;
+      }
+
+      return n ;
+    }
+
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & functor )
+    { return team_size_max( functor ); }
+
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & functor , const int vector_length)
+    {
+      int max = team_size_max( functor )/vector_length;
+      if(max<1) max = 1;
+      return max;
+    }
+
+  inline static
+  int vector_length_max()
+    { return Impl::CudaTraits::WarpSize; }
+
+  //----------------------------------------
+
+  inline int vector_length()   const { return m_vector_length ; }
+  inline int team_size()   const { return m_team_size ; }
+  inline int league_size() const { return m_league_size ; }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicy( execution_space & , int league_size_ , int team_size_request , int vector_length_request = 1 )
+    : m_league_size( league_size_ )
+    , m_team_size( team_size_request )
+    , m_vector_length ( vector_length_request )
+    {
+      // Allow only power-of-two vector_length
+      int check = 0;
+      for(int k = 1; k <= vector_length_max(); k*=2)
+        if(k == vector_length_request)
+          check = 1;
+      if(!check)
+        Impl::throw_runtime_exception( "Requested non-power-of-two vector length for TeamPolicy.");
+
+      // Make sure league size is permissable
+      if(league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
+        Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space.");
+    }
+
+  TeamPolicy( int league_size_ , int team_size_request , int vector_length_request = 1 )
+    : m_league_size( league_size_ )
+    , m_team_size( team_size_request )
+    , m_vector_length ( vector_length_request )
+    {
+      // Allow only power-of-two vector_length
+      int check = 0;
+      for(int k = 1; k <= vector_length_max(); k*=2)
+        if(k == vector_length_request)
+          check = 1;
+      if(!check)
+        Impl::throw_runtime_exception( "Requested non-power-of-two vector length for TeamPolicy.");
+
+      // Make sure league size is permissable
+      if(league_size_ >= int(Impl::cuda_internal_maximum_grid_count()))
+        Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on Cuda execution space.");
+
+    }
+
+  typedef Kokkos::Impl::CudaTeamMember member_type ;
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
+class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Cuda > >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Cuda > Policy ;
+
+  const FunctorType  m_functor ;
+  const Policy       m_policy ;  
+
+  ParallelFor();
+  ParallelFor & operator = ( const ParallelFor & );
+
+  template< class Tag >
+  inline static
+  __device__
+  void driver( const FunctorType & functor
+             , typename Impl::enable_if< Impl::is_same< Tag , void >::value
+               , typename Policy::member_type const & >::type iwork
+             )
+    { functor( iwork ); }
+
+  template< class Tag >
+  inline static
+  __device__
+  void driver( const FunctorType & functor
+             , typename Impl::enable_if< ! Impl::is_same< Tag , void >::value
+               , typename Policy::member_type const & >::type iwork
+             )
+    { functor( Tag() , iwork ); }
+
+public:
+
+  typedef FunctorType functor_type ;
+
+  inline
+  __device__
+  void operator()(void) const
+    {
+      const typename Policy::member_type work_stride = blockDim.y * gridDim.x ;
+      const typename Policy::member_type work_end    = m_policy.end();
+
+      for ( typename Policy::member_type
+              iwork =  m_policy.begin() + threadIdx.y + blockDim.y * blockIdx.x ;
+              iwork <  work_end ;
+              iwork += work_stride ) {
+        ParallelFor::template driver< typename Policy::work_tag >( m_functor, iwork );
+      }
+    }
+
+  ParallelFor( const FunctorType  & functor ,
+               const Policy       & policy )
+    : m_functor( functor )
+    , m_policy(  policy )
+    {
+      const dim3 block(  1 , CudaTraits::WarpSize * cuda_internal_maximum_warp_count(), 1);
+      const dim3 grid( std::min( ( int( policy.end() - policy.begin() ) + block.y - 1 ) / block.y
+                               , cuda_internal_maximum_grid_count() )
+                     , 1 , 1);
+
+      CudaParallelLaunch< ParallelFor >( *this , grid , block , 0 );
+    }
+};
+
+template< class FunctorType , class Arg0 , class Arg1 >
+class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Cuda > >
+{
+private:
+
+  typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Cuda >   Policy ;
+
+public:
+
+  typedef FunctorType      functor_type ;
+  typedef Cuda::size_type  size_type ;
+
+private:
+
+  // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y == blockDim.z == 1
+  // shared memory utilization:
+  //
+  //  [ team   reduce space ]
+  //  [ team   shared space ]
+  //
+
+  const FunctorType m_functor ;
+  size_type         m_shmem_begin ;
+  size_type         m_shmem_size ;
+  size_type         m_league_size ;
+
+  template< class TagType >
+  __device__ inline
+  void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
+                 const typename Policy::member_type & >::type member ) const
+    { m_functor( member ); }
+
+  template< class TagType >
+  __device__ inline
+  void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
+                 const typename Policy::member_type & >::type  member ) const
+    { m_functor( TagType() , member ); }
+
+public:
+
+  __device__ inline
+  void operator()(void) const
+  {
+    // Iterate this block through the league
+    for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
+
+      ParallelFor::template driver< typename Policy::work_tag >(
+        typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
+                                    , m_shmem_begin
+                                    , m_shmem_size
+                                    , league_rank
+                                    , m_league_size ) );
+    }
+  }
+
+
+  ParallelFor( const FunctorType  & functor 
+             , const Policy       & policy 
+             )
+  : m_functor( functor )
+  , m_shmem_begin( sizeof(double) * ( policy.team_size() + 2 ) )
+  , m_shmem_size( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
+  , m_league_size( policy.league_size() )
+  {
+    // Functor's reduce memory, team scan memory, and team shared memory depend upon team size.
+
+    const int shmem_size_total = m_shmem_begin + m_shmem_size ;
+
+    if ( CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
+      Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelFor< Cuda > insufficient shared memory"));
+    }
+
+    const dim3 grid( int(policy.league_size()) , 1 , 1 );
+    const dim3 block( policy.vector_length() , policy.team_size() , 1 );
+
+    CudaParallelLaunch< ParallelFor >( *this, grid, block, shmem_size_total ); // copy to device and execute
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
+class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Cuda > >
+{
+private:
+
+  typedef Kokkos::RangePolicy<Arg0,Arg1,Arg2, Kokkos::Cuda >         Policy ;
+  typedef typename Policy::WorkRange                                 work_range ;
+  typedef typename Policy::work_tag                                  work_tag ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , work_tag > ValueInit ;
+
+public:
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::value_type      value_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef FunctorType                           functor_type ;
+  typedef Cuda::size_type                       size_type ;
+
+  // Algorithmic constraints: blockSize is a power of two AND blockDim.y == blockDim.z == 1
+
+  const FunctorType m_functor ;
+  const Policy      m_policy ;
+  size_type *       m_scratch_space ;
+  size_type *       m_scratch_flags ;
+  size_type *       m_unified_space ;
+
+  // Determine block size constrained by shared memory:
+  static inline
+  unsigned local_block_size( const FunctorType & f )
+    {
+      unsigned n = CudaTraits::WarpSize * 8 ;
+      while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,work_tag>( f , n ) ) { n >>= 1 ; }
+      return n ;
+    }
+
+  template< class Tag >
+  inline static
+  __device__
+  void driver( const FunctorType & functor
+             , typename Impl::enable_if< Impl::is_same< Tag , void >::value
+               , typename Policy::member_type const & >::type iwork
+             , reference_type value )
+    { functor( iwork , value ); }
+
+  template< class Tag >
+  inline static
+  __device__
+  void driver( const FunctorType & functor
+             , typename Impl::enable_if< ! Impl::is_same< Tag , void >::value
+               , typename Policy::member_type const & >::type iwork
+             , reference_type value )
+    { functor( Tag() , iwork , value ); }
+
+#ifndef KOKKOS_EXPERIMENTAL_CUDA_SHFL_REDUCTION
+  __device__ inline
+  void operator()(void) const
+  {
+    const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+      word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) );
+
+    {
+      reference_type value =
+        ValueInit::init( m_functor , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
+
+      // Number of blocks is bounded so that the reduction can be limited to two passes.
+      // Each thread block is given an approximately equal amount of work to perform.
+      // Accumulate the values for this block.
+      // The accumulation ordering does not match the final pass, but is arithmatically equivalent.
+
+      const work_range range( m_policy , blockIdx.x , gridDim.x );
+
+      for ( typename work_range::member_type iwork = range.begin() + threadIdx.y , iwork_end = range.end() ;
+            iwork < iwork_end ; iwork += blockDim.y ) {
+        ParallelReduce::template driver< work_tag >( m_functor , iwork , value );
+      }
+    }
+
+    // Reduce with final value at blockDim.y - 1 location.
+    if ( cuda_single_inter_block_reduce_scan<false,FunctorType,work_tag>(
+           m_functor , blockIdx.x , gridDim.x ,
+           kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
+
+      // This is the final block with the final result at the final threads' location
+
+      size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
+      size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
+
+      if ( threadIdx.y == 0 ) {
+        Kokkos::Impl::FunctorFinal< FunctorType , work_tag >::final( m_functor , shared );
+      }
+
+      if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
+
+      for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
+    }
+  }
+#else
+  __device__ inline
+   void operator()(void) const
+   {
+
+     value_type value = 0;
+
+     // Number of blocks is bounded so that the reduction can be limited to two passes.
+     // Each thread block is given an approximately equal amount of work to perform.
+     // Accumulate the values for this block.
+     // The accumulation ordering does not match the final pass, but is arithmatically equivalent.
+
+     const Policy range( m_policy , blockIdx.x , gridDim.x );
+
+     for ( typename Policy::member_type iwork = range.begin() + threadIdx.y , iwork_end = range.end() ;
+           iwork < iwork_end ; iwork += blockDim.y ) {
+       ParallelReduce::template driver< work_tag >( m_functor , iwork , value );
+     }
+
+     pointer_type const result = (pointer_type) (m_unified_space ? m_unified_space : m_scratch_space) ;
+     int max_active_thread = range.end()-range.begin() < blockDim.y ? range.end() - range.begin():blockDim.y;
+     max_active_thread = max_active_thread == 0?blockDim.y:max_active_thread;
+     if(Impl::cuda_inter_block_reduction<FunctorType,Impl::JoinAdd<value_type> >
+            (value,Impl::JoinAdd<value_type>(),m_scratch_space,result,m_scratch_flags,max_active_thread)) {
+       const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
+       if(id==0) {
+         Kokkos::Impl::FunctorFinal< FunctorType , work_tag >::final( m_functor , (void*) &value );
+         *result = value;
+       }
+     }
+   }
+#endif
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & functor 
+                , const Policy       & policy 
+                , const HostViewType & result
+                )
+  : m_functor( functor )
+  , m_policy(  policy )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  {
+    const int block_size  = local_block_size( functor );
+    const int block_count = std::min( int(block_size)
+                                    , ( int(policy.end() - policy.begin()) + block_size - 1 ) / block_size
+                                    );
+
+    m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( functor ) * block_count );
+    m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
+    m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( functor ) );
+
+    const dim3 grid( block_count , 1 , 1 );
+    const dim3 block( 1 , block_size , 1 ); // REQUIRED DIMENSIONS ( 1 , N , 1 )
+#ifdef KOKKOS_EXPERIMENTAL_CUDA_SHFL_REDUCTION
+    const int shmem = 0;
+#else
+    const int shmem = cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,work_tag>( m_functor , block.y );
+#endif
+
+    CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem ); // copy to device and execute
+
+    Cuda::fence();
+
+    if ( result.ptr_on_device() ) {
+      if ( m_unified_space ) {
+        const int count = ValueTraits::value_count( m_functor );
+        for ( int i = 0 ; i < count ; ++i ) { result.ptr_on_device()[i] = pointer_type(m_unified_space)[i] ; }
+      }
+      else {
+        const int size = ValueTraits::value_size( m_functor );
+        DeepCopy<HostSpace,CudaSpace>( result.ptr_on_device() , m_scratch_space , size );
+      }
+    }
+  }
+};
+
+template< class FunctorType , class Arg0 , class Arg1 >
+class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Cuda > >
+{
+private:
+
+  typedef Kokkos::TeamPolicy<Arg0,Arg1,Kokkos::Cuda>                  Policy ;
+  typedef typename Policy::work_tag                                   work_tag ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , work_tag >  ValueInit ;
+  typedef typename ValueTraits::pointer_type                          pointer_type ;
+  typedef typename ValueTraits::reference_type                        reference_type ;
+
+public:
+
+  typedef FunctorType      functor_type ;
+  typedef Cuda::size_type  size_type ;
+
+private:
+
+  // Algorithmic constraints: blockDim.y is a power of two AND blockDim.y == blockDim.z == 1
+  // shared memory utilization:
+  //
+  //  [ global reduce space ]
+  //  [ team   reduce space ]
+  //  [ team   shared space ]
+  //
+
+  const FunctorType m_functor ;
+  size_type *       m_scratch_space ;
+  size_type *       m_scratch_flags ;
+  size_type *       m_unified_space ;
+  size_type         m_team_begin ;
+  size_type         m_shmem_begin ;
+  size_type         m_shmem_size ;
+  size_type         m_league_size ;
+
+  template< class TagType >
+  __device__ inline
+  void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
+                 const typename Policy::member_type & >::type  member 
+             , reference_type update ) const
+    { m_functor( member , update ); }
+
+  template< class TagType >
+  __device__ inline
+  void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
+                 const typename Policy::member_type & >::type  member 
+             , reference_type update ) const
+    { m_functor( TagType() , member , update ); }
+
+public:
+
+  __device__ inline
+  void operator()(void) const
+  {
+    const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+      word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) );
+
+    reference_type value =
+      ValueInit::init( m_functor , kokkos_impl_cuda_shared_memory<size_type>() + threadIdx.y * word_count.value );
+
+    // Iterate this block through the league
+    for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
+
+      ParallelReduce::template driver< work_tag >
+        ( typename Policy::member_type( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
+                                        , m_shmem_begin
+                                        , m_shmem_size
+                                        , league_rank
+                                        , m_league_size )
+        , value );
+    }
+
+    // Reduce with final value at blockDim.y - 1 location.
+    if ( cuda_single_inter_block_reduce_scan<false,FunctorType,work_tag>(
+           m_functor , blockIdx.x , gridDim.x ,
+           kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags ) ) {
+
+      // This is the final block with the final result at the final threads' location
+
+      size_type * const shared = kokkos_impl_cuda_shared_memory<size_type>() + ( blockDim.y - 1 ) * word_count.value ;
+      size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ;
+
+      if ( threadIdx.y == 0 ) {
+        Kokkos::Impl::FunctorFinal< FunctorType , work_tag >::final( m_functor , shared );
+      }
+
+      if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); }
+
+      for ( unsigned i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i]; }
+    }
+  }
+
+
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & functor 
+                , const Policy       & policy 
+                , const HostViewType & result
+                )
+  : m_functor( functor )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_unified_space( 0 )
+  , m_team_begin( cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,work_tag>( functor , policy.team_size() ) )
+  , m_shmem_begin( sizeof(double) * ( policy.team_size() + 2 ) )
+  , m_shmem_size( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
+  , m_league_size( policy.league_size() )
+  {
+
+    // The global parallel_reduce does not support vector_length other than 1 at the moment
+    if(policy.vector_length() > 1)
+      Impl::throw_runtime_exception( "Kokkos::parallel_reduce with a TeamPolicy using a vector length of greater than 1 is not currently supported for CUDA.");
+
+    // Functor's reduce memory, team scan memory, and team shared memory depend upon team size.
+
+    const int shmem_size_total = m_team_begin + m_shmem_begin + m_shmem_size ;
+    const int not_power_of_two = 0 != ( policy.team_size() & ( policy.team_size() - 1 ) );
+
+    if ( not_power_of_two ||  CudaTraits::SharedMemoryCapacity < shmem_size_total ) {
+      Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > bad team size"));
+    }
+
+    const int block_count = std::min( policy.league_size() , policy.team_size() );
+
+    m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( functor ) * block_count );
+    m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) );
+    m_unified_space = cuda_internal_scratch_unified( ValueTraits::value_size( functor ) );
+
+    const dim3 grid( block_count , 1 , 1 );
+    const dim3 block( 1 , policy.team_size() , 1 ); // REQUIRED DIMENSIONS ( 1 , N , 1 )
+
+    CudaParallelLaunch< ParallelReduce >( *this, grid, block, shmem_size_total ); // copy to device and execute
+
+    Cuda::fence();
+
+    if ( result.ptr_on_device() ) {
+      if ( m_unified_space ) {
+        const int count = ValueTraits::value_count( m_functor );
+        for ( int i = 0 ; i < count ; ++i ) { result.ptr_on_device()[i] = pointer_type(m_unified_space)[i] ; }
+      }
+      else {
+        const int size = ValueTraits::value_size( m_functor );
+        DeepCopy<HostSpace,CudaSpace>( result.ptr_on_device() , m_scratch_space , size );
+      }
+    }
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
+class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Cuda > >
+{
+private:
+
+  typedef Kokkos::RangePolicy<Arg0,Arg1,Arg2, Kokkos::Cuda >          Policy ;
+  typedef typename Policy::WorkRange                                  work_range ;
+  typedef typename Policy::work_tag                                   work_tag ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , work_tag >  ValueInit ;
+  typedef Kokkos::Impl::FunctorValueOps<    FunctorType , work_tag >  ValueOps ;
+
+public:
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+  typedef FunctorType                           functor_type ;
+  typedef Cuda::size_type                       size_type ;
+
+  // Algorithmic constraints:
+  //  (a) blockDim.y is a power of two
+  //  (b) blockDim.y == blockDim.z == 1
+  //  (c) gridDim.x  <= blockDim.y * blockDim.y
+  //  (d) gridDim.y  == gridDim.z == 1
+
+  // Determine block size constrained by shared memory:
+  static inline
+  unsigned local_block_size( const FunctorType & f )
+    {
+      // blockDim.y must be power of two = 128 (4 warps) or 256 (8 warps) or 512 (16 warps)
+      // gridDim.x <= blockDim.y * blockDim.y
+      //
+      // 4 warps was 10% faster than 8 warps and 20% faster than 16 warps in unit testing
+
+      unsigned n = CudaTraits::WarpSize * 4 ;
+      while ( n && CudaTraits::SharedMemoryCapacity < cuda_single_inter_block_reduce_scan_shmem<false,FunctorType,work_tag>( f , n ) ) { n >>= 1 ; }
+      return n ;
+    }
+
+  const FunctorType m_functor ;
+  const Policy      m_policy ;
+  size_type *       m_scratch_space ;
+  size_type *       m_scratch_flags ;
+        size_type   m_final ;
+  
+  template< class Tag >
+  inline static
+  __device__
+  void driver( const FunctorType & functor
+             , typename Impl::enable_if< Impl::is_same< Tag , void >::value
+               , typename Policy::member_type const & >::type iwork
+             , reference_type value 
+             , const bool     final )
+    { functor( iwork , value , final ); }
+
+  template< class Tag >
+  inline static
+  __device__
+  void driver( const FunctorType & functor
+             , typename Impl::enable_if< ! Impl::is_same< Tag , void >::value
+               , typename Policy::member_type const & >::type iwork
+             , reference_type value
+             , const bool     final )
+    { functor( Tag() , iwork , value , final ); }
+
+  //----------------------------------------
+
+  __device__ inline
+  void initial(void) const
+  {
+    const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+      word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) );
+
+    size_type * const shared_value = kokkos_impl_cuda_shared_memory<size_type>() + word_count.value * threadIdx.y ;
+
+    ValueInit::init( m_functor , shared_value );
+
+    // Number of blocks is bounded so that the reduction can be limited to two passes.
+    // Each thread block is given an approximately equal amount of work to perform.
+    // Accumulate the values for this block.
+    // The accumulation ordering does not match the final pass, but is arithmatically equivalent.
+
+    const work_range range( m_policy , blockIdx.x , gridDim.x );
+
+    for ( typename Policy::member_type iwork = range.begin() + threadIdx.y , iwork_end = range.end() ;
+          iwork < iwork_end ; iwork += blockDim.y ) {
+      ParallelScan::template driver< work_tag >
+        ( m_functor , iwork , ValueOps::reference( shared_value ) , false );
+    }
+
+    // Reduce and scan, writing out scan of blocks' totals and block-groups' totals.
+    // Blocks' scan values are written to 'blockIdx.x' location.
+    // Block-groups' scan values are at: i = ( j * blockDim.y - 1 ) for i < gridDim.x
+    cuda_single_inter_block_reduce_scan<true,FunctorType,work_tag>( m_functor , blockIdx.x , gridDim.x , kokkos_impl_cuda_shared_memory<size_type>() , m_scratch_space , m_scratch_flags );
+  }
+
+  //----------------------------------------
+
+  __device__ inline
+  void final(void) const
+  {
+    const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+      word_count( ValueTraits::value_size( m_functor ) / sizeof(size_type) );
+
+    // Use shared memory as an exclusive scan: { 0 , value[0] , value[1] , value[2] , ... }
+    size_type * const shared_data   = kokkos_impl_cuda_shared_memory<size_type>();
+    size_type * const shared_prefix = shared_data + word_count.value * threadIdx.y ;
+    size_type * const shared_accum  = shared_data + word_count.value * ( blockDim.y + 1 );
+
+    // Starting value for this thread block is the previous block's total.
+    if ( blockIdx.x ) {
+      size_type * const block_total = m_scratch_space + word_count.value * ( blockIdx.x - 1 );
+      for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i] ; }
+    }
+    else if ( 0 == threadIdx.y ) {
+      ValueInit::init( m_functor , shared_accum );
+    }
+
+    const work_range range( m_policy , blockIdx.x , gridDim.x );
+
+    for ( typename Policy::member_type iwork_base = range.begin(); iwork_base < range.end() ; iwork_base += blockDim.y ) {
+
+      const typename Policy::member_type iwork = iwork_base + threadIdx.y ;
+
+      __syncthreads(); // Don't overwrite previous iteration values until they are used
+
+      ValueInit::init( m_functor , shared_prefix + word_count.value );
+
+      // Copy previous block's accumulation total into thread[0] prefix and inclusive scan value of this block
+      for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) {
+        shared_data[i + word_count.value] = shared_data[i] = shared_accum[i] ;
+      }
+
+      if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } // Protect against large scan values.
+
+      // Call functor to accumulate inclusive scan value for this work item
+      if ( iwork < range.end() ) {
+        ParallelScan::template driver< work_tag >
+          ( m_functor , iwork , ValueOps::reference( shared_prefix + word_count.value ) , false );
+      }
+
+      // Scan block values into locations shared_data[1..blockDim.y]
+      cuda_intra_block_reduce_scan<true,FunctorType,work_tag>( m_functor , ValueTraits::pointer_type(shared_data+word_count.value) );
+
+      {
+        size_type * const block_total = shared_data + word_count.value * blockDim.y ;
+        for ( unsigned i = threadIdx.y ; i < word_count.value ; ++i ) { shared_accum[i] = block_total[i]; }
+      }
+
+      // Call functor with exclusive scan value
+      if ( iwork < range.end() ) {
+        ParallelScan::template driver< work_tag >
+          ( m_functor , iwork , ValueOps::reference( shared_prefix ) , true );
+      }
+    }
+  }
+
+  //----------------------------------------
+
+  __device__ inline
+  void operator()(void) const
+  {
+    if ( ! m_final ) {
+      initial();
+    }
+    else {
+      final();
+    }
+  }
+
+  ParallelScan( const FunctorType  & functor ,
+                const Policy       & policy )
+  : m_functor( functor )
+  , m_policy( policy )
+  , m_scratch_space( 0 )
+  , m_scratch_flags( 0 )
+  , m_final( false )
+  {
+    enum { GridMaxComputeCapability_2x = 0x0ffff };
+
+    const int block_size = local_block_size( functor );
+
+    const int grid_max = ( block_size * block_size ) < GridMaxComputeCapability_2x ?
+                         ( block_size * block_size ) : GridMaxComputeCapability_2x ;
+
+    // At most 'max_grid' blocks:
+    const int nwork    = policy.end() - policy.begin();
+    const int max_grid = std::min( int(grid_max) , int(( nwork + block_size - 1 ) / block_size ));
+
+    // How much work per block:
+    const int work_per_block = ( nwork + max_grid - 1 ) / max_grid ;
+
+    // How many block are really needed for this much work:
+    const dim3 grid( ( nwork + work_per_block - 1 ) / work_per_block , 1 , 1 );
+    const dim3 block( 1 , block_size , 1 ); // REQUIRED DIMENSIONS ( 1 , N , 1 )
+    const int shmem = ValueTraits::value_size( functor ) * ( block_size + 2 );
+
+    m_scratch_space = cuda_internal_scratch_space( ValueTraits::value_size( functor ) * grid.x );
+    m_scratch_flags = cuda_internal_scratch_flags( sizeof(size_type) * 1 );
+
+    m_final = false ;
+    CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
+
+    m_final = true ;
+    CudaParallelLaunch< ParallelScan >( *this, grid, block, shmem ); // copy to device and execute
+  }
+
+  void wait() const { Cuda::fence(); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+  template<typename iType>
+  struct TeamThreadRangeBoundariesStruct<iType,CudaTeamMember> {
+    typedef iType index_type;
+    const iType start;
+    const iType end;
+    const iType increment;
+    const CudaTeamMember& thread;
+
+#ifdef __CUDA_ARCH__
+    __device__ inline
+    TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count):
+      start( threadIdx.y ),
+      end( count ),
+      increment( blockDim.y ),
+      thread(thread_)
+    {}
+    __device__ inline
+    TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& begin_, const iType& end_):
+      start( begin_+threadIdx.y ),
+      end( end_ ),
+      increment( blockDim.y ),
+      thread(thread_)
+    {}
+#else
+    KOKKOS_INLINE_FUNCTION
+    TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count):
+      start( 0 ),
+      end( count ),
+      increment( 1 ),
+      thread(thread_)
+    {}
+    KOKKOS_INLINE_FUNCTION
+    TeamThreadRangeBoundariesStruct (const CudaTeamMember& thread_,  const iType& begin_, const iType& end_):
+      start( begin_ ),
+      end( end_ ),
+      increment( 1 ),
+      thread(thread_)
+    {}
+#endif
+  };
+
+  template<typename iType>
+  struct ThreadVectorRangeBoundariesStruct<iType,CudaTeamMember> {
+    typedef iType index_type;
+    const iType start;
+    const iType end;
+    const iType increment;
+
+#ifdef __CUDA_ARCH__
+    __device__ inline
+    ThreadVectorRangeBoundariesStruct (const CudaTeamMember& thread, const iType& count):
+    start( threadIdx.x ),
+    end( count ),
+    increment( blockDim.x )
+    {}
+#else
+    KOKKOS_INLINE_FUNCTION
+    ThreadVectorRangeBoundariesStruct (const CudaTeamMember& thread_, const iType& count):
+      start( 0 ),
+      end( count ),
+      increment( 1 )
+    {}
+#endif
+    };
+
+} // namespace Impl
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>
+  TeamThreadRange(const Impl::CudaTeamMember& thread, const iType& count) {
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>(thread,count);
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>
+  TeamThreadRange(const Impl::CudaTeamMember& thread, const iType& begin, const iType& end) {
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>(thread,begin,end);
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
+  ThreadVectorRange(const Impl::CudaTeamMember& thread, const iType& count) {
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >(thread,count);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::CudaTeamMember> PerTeam(const Impl::CudaTeamMember& thread) {
+  return Impl::ThreadSingleStruct<Impl::CudaTeamMember>(thread);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::CudaTeamMember> PerThread(const Impl::CudaTeamMember& thread) {
+  return Impl::VectorSingleStruct<Impl::CudaTeamMember>(thread);
+}
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+   *
+   * The range i=0..N-1 is mapped to all threads of the the calling thread team.
+   * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>& loop_boundaries, const Lambda& lambda) {
+  #ifdef __CUDA_ARCH__
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+  #endif
+}
+
+/** \brief  Inter-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the the calling thread team and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>& loop_boundaries,
+                     const Lambda & lambda, ValueType& result) {
+
+#ifdef __CUDA_ARCH__
+  result = ValueType();
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+
+  Impl::cuda_intra_warp_reduction(result,[&] (ValueType& dst, const ValueType& src) { dst+=src; });
+  Impl::cuda_inter_warp_reduction(result,[&] (ValueType& dst, const ValueType& src) { dst+=src; });
+
+#endif
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::CudaTeamMember>& loop_boundaries,
+                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+#ifdef __CUDA_ARCH__
+  ValueType result = init_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+
+  Impl::cuda_intra_warp_reduction(result, join );
+  Impl::cuda_inter_warp_reduction(result, join );
+
+  init_result = result;
+#endif
+}
+
+} //namespace Kokkos
+
+namespace Kokkos {
+/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
+ * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
+    loop_boundaries, const Lambda& lambda) {
+#ifdef __CUDA_ARCH__
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+#endif
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
+      loop_boundaries, const Lambda & lambda, ValueType& result) {
+#ifdef __CUDA_ARCH__
+  ValueType val = ValueType();
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,val);
+  }
+
+  result = val;
+
+  if (loop_boundaries.increment > 1)
+    result += shfl_down(result, 1,loop_boundaries.increment);
+  if (loop_boundaries.increment > 2)
+    result += shfl_down(result, 2,loop_boundaries.increment);
+  if (loop_boundaries.increment > 4)
+    result += shfl_down(result, 4,loop_boundaries.increment);
+  if (loop_boundaries.increment > 8)
+    result += shfl_down(result, 8,loop_boundaries.increment);
+  if (loop_boundaries.increment > 16)
+    result += shfl_down(result, 16,loop_boundaries.increment);
+
+  result = shfl(result,0,loop_boundaries.increment);
+#endif
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
+      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+#ifdef __CUDA_ARCH__
+  ValueType result = init_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,result);
+  }
+
+  if (loop_boundaries.increment > 1)
+    join( result, shfl_down(result, 1,loop_boundaries.increment));
+  if (loop_boundaries.increment > 2)
+    join( result, shfl_down(result, 2,loop_boundaries.increment));
+  if (loop_boundaries.increment > 4)
+    join( result, shfl_down(result, 4,loop_boundaries.increment));
+  if (loop_boundaries.increment > 8)
+    join( result, shfl_down(result, 8,loop_boundaries.increment));
+  if (loop_boundaries.increment > 16)
+    join( result, shfl_down(result, 16,loop_boundaries.increment));
+
+  init_result = shfl(result,0,loop_boundaries.increment);
+#endif
+}
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
+ *          for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
+ * Depending on the target execution space the operator might be called twice: once with final=false
+ * and once with final=true. When final==true val contains the prefix sum value. The contribution of this
+ * "i" needs to be added to val no matter whether final==true or not. In a serial execution
+ * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
+ * to the final sum value over all vector lanes.
+ * This functionality requires C++11 support.*/
+template< typename iType, class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
+      loop_boundaries, const FunctorType & lambda) {
+
+#ifdef __CUDA_ARCH__
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
+  typedef typename ValueTraits::value_type value_type ;
+
+  value_type scan_val = value_type();
+  const int VectorLength = blockDim.x;
+
+  iType loop_bound = ((loop_boundaries.end+VectorLength-1)/VectorLength) * VectorLength;
+  for(int _i = threadIdx.x; _i < loop_bound; _i += VectorLength) {
+    value_type val = value_type();
+    if(_i<loop_boundaries.end)
+      lambda(_i , val , false);
+
+    value_type tmp = val;
+    value_type result_i;
+
+    if(threadIdx.x%VectorLength == 0)
+      result_i = tmp;
+    if (VectorLength > 1) {
+      const value_type tmp2 = shfl_up(tmp, 1,VectorLength);
+      if(threadIdx.x > 0)
+        tmp+=tmp2;
+    }
+    if(threadIdx.x%VectorLength == 1)
+      result_i = tmp;
+    if (VectorLength > 3) {
+      const value_type tmp2 = shfl_up(tmp, 2,VectorLength);
+      if(threadIdx.x > 1)
+        tmp+=tmp2;
+    }
+    if ((threadIdx.x%VectorLength >= 2) &&
+        (threadIdx.x%VectorLength < 4))
+      result_i = tmp;
+    if (VectorLength > 7) {
+      const value_type tmp2 = shfl_up(tmp, 4,VectorLength);
+      if(threadIdx.x > 3)
+        tmp+=tmp2;
+    }
+    if ((threadIdx.x%VectorLength >= 4) &&
+        (threadIdx.x%VectorLength < 8))
+      result_i = tmp;
+    if (VectorLength > 15) {
+      const value_type tmp2 = shfl_up(tmp, 8,VectorLength);
+      if(threadIdx.x > 7)
+        tmp+=tmp2;
+    }
+    if ((threadIdx.x%VectorLength >= 8) &&
+        (threadIdx.x%VectorLength < 16))
+      result_i = tmp;
+    if (VectorLength > 31) {
+      const value_type tmp2 = shfl_up(tmp, 16,VectorLength);
+      if(threadIdx.x > 15)
+        tmp+=tmp2;
+    }
+    if (threadIdx.x%VectorLength >= 16)
+      result_i = tmp;
+
+    val = scan_val + result_i - val;
+    scan_val += shfl(tmp,VectorLength-1,VectorLength);
+    if(_i<loop_boundaries.end)
+      lambda(_i , val , true);
+  }
+#endif
+}
+
+}
+
+namespace Kokkos {
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
+#ifdef __CUDA_ARCH__
+  if(threadIdx.x == 0) lambda();
+#endif
+}
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda) {
+#ifdef __CUDA_ARCH__
+  if(threadIdx.x == 0 && threadIdx.y == 0) lambda();
+#endif
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::CudaTeamMember>& , const FunctorType& lambda, ValueType& val) {
+#ifdef __CUDA_ARCH__
+  if(threadIdx.x == 0) lambda(val);
+  val = shfl(val,0,blockDim.x);
+#endif
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::CudaTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+#ifdef __CUDA_ARCH__
+  if(threadIdx.x == 0 && threadIdx.y == 0) {
+    lambda(val);
+  }
+  single_struct.team_member.team_broadcast(val,0);
+#endif
+}
+
+}
+
+namespace Kokkos {
+
+namespace Impl {
+  template< class FunctorType, class ExecPolicy, class ValueType , class Tag = typename ExecPolicy::work_tag>
+  struct CudaFunctorAdapter {
+    const FunctorType f;
+    typedef ValueType value_type;
+    CudaFunctorAdapter(const FunctorType& f_):f(f_) {}
+
+    __device__ inline
+    void operator() (typename ExecPolicy::work_tag, const typename ExecPolicy::member_type& i, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals third argument type of FunctorType::operator()
+      f(typename ExecPolicy::work_tag(), i,val);
+    }
+  };
+
+  template< class FunctorType, class ExecPolicy, class ValueType >
+  struct CudaFunctorAdapter<FunctorType,ExecPolicy,ValueType,void> {
+    const FunctorType f;
+    typedef ValueType value_type;
+    CudaFunctorAdapter(const FunctorType& f_):f(f_) {}
+
+    __device__ inline
+    void operator() (const typename ExecPolicy::member_type& i, ValueType& val) const {
+      //Insert Static Assert with decltype on ValueType equals second argument type of FunctorType::operator()
+      f(i,val);
+    }
+
+  };
+
+  template< class FunctorType, class Enable = void>
+  struct ReduceFunctorHasInit {
+    enum {value = false};
+  };
+
+  template< class FunctorType>
+  struct ReduceFunctorHasInit<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::init ) >::type > {
+    enum {value = true};
+  };
+
+  template< class FunctorType, class Enable = void>
+  struct ReduceFunctorHasJoin {
+    enum {value = false};
+  };
+
+  template< class FunctorType>
+  struct ReduceFunctorHasJoin<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type > {
+    enum {value = true};
+  };
+
+  template< class FunctorType, class Enable = void>
+  struct ReduceFunctorHasFinal {
+    enum {value = false};
+  };
+
+  template< class FunctorType>
+  struct ReduceFunctorHasFinal<FunctorType, typename Impl::enable_if< 0 < sizeof( & FunctorType::final ) >::type > {
+    enum {value = true};
+  };
+
+  template< class FunctorType, bool Enable =
+      ( FunctorDeclaresValueType<FunctorType,void>::value) ||
+      ( ReduceFunctorHasInit<FunctorType>::value  ) ||
+      ( ReduceFunctorHasJoin<FunctorType>::value  ) ||
+      ( ReduceFunctorHasFinal<FunctorType>::value )
+      >
+  struct IsNonTrivialReduceFunctor {
+    enum {value = false};
+  };
+
+  template< class FunctorType>
+  struct IsNonTrivialReduceFunctor<FunctorType, true> {
+    enum {value = true};
+  };
+
+  template<class FunctorType, class ResultType, class Tag, bool Enable = IsNonTrivialReduceFunctor<FunctorType>::value >
+  struct FunctorReferenceType {
+    typedef ResultType& reference_type;
+  };
+
+  template<class FunctorType, class ResultType, class Tag>
+  struct FunctorReferenceType<FunctorType, ResultType, Tag, true> {
+    typedef typename Kokkos::Impl::FunctorValueTraits< FunctorType ,Tag >::reference_type reference_type;
+  };
+
+}
+
+// general policy and view ouput
+template< class ExecPolicy , class FunctorTypeIn , class ViewType >
+inline
+void parallel_reduce( const ExecPolicy  & policy
+                    , const FunctorTypeIn & functor_in
+                    , const ViewType    & result_view
+                    , const std::string& str = "" 
+                    , typename Impl::enable_if<
+                      ( Impl::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value &&
+                        Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value
+                      )>::type * = 0 )
+{
+  enum {FunctorHasValueType = Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value };
+  typedef typename Kokkos::Impl::if_c<FunctorHasValueType, FunctorTypeIn, Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,typename ViewType::value_type> >::type FunctorType;
+  FunctorType functor = Impl::if_c<FunctorHasValueType,FunctorTypeIn,FunctorType>::select(functor_in,FunctorType(functor_in));
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+    uint64_t kpID = 0;
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     }
+#endif
+    
+  (void) Impl::ParallelReduce< FunctorType, ExecPolicy >( functor , policy , result_view );
+    
+#ifdef KOKKOSP_ENABLE_PROFILING
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::endParallelScan(kpID);
+     }
+#endif
+}
+
+// general policy and pod or array of pod output
+template< class ExecPolicy , class FunctorTypeIn , class ResultType>
+inline
+void parallel_reduce( const ExecPolicy  & policy
+                    , const FunctorTypeIn & functor_in
+                    , ResultType& result_ref
+                    , const std::string& str = "" 
+                    , typename Impl::enable_if<
+                      ( ! Impl::is_view<ResultType>::value &&
+                        ! Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value &&
+                        ! Impl::is_integral< ExecPolicy >::value  &&
+                          Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )>::type * = 0 )
+{
+  typedef typename Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,ResultType> FunctorType;
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueOps<    FunctorType , typename ExecPolicy::work_tag >  ValueOps ;
+
+  // Wrap the result output request in a view to inform the implementation
+  // of the type and memory space.
+
+  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
+                                     , typename ValueTraits::value_type
+                                     , typename ValueTraits::pointer_type
+                                     >::type value_type ;
+  Kokkos::View< value_type
+              , HostSpace
+              , Kokkos::MemoryUnmanaged
+              >
+    result_view( ValueOps::pointer( result_ref )
+               , 1
+               );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+    uint64_t kpID = 0;
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     }
+#endif
+    
+  (void) Impl::ParallelReduce< FunctorType, ExecPolicy >( FunctorType(functor_in) , policy , result_view );
+    
+#ifdef KOKKOSP_ENABLE_PROFILING
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::endParallelScan(kpID);
+     }
+#endif
+}
+
+// general policy and pod or array of pod output
+template< class ExecPolicy , class FunctorType>
+inline
+void parallel_reduce( const ExecPolicy  & policy
+                    , const FunctorType & functor
+                    , typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type result_ref
+                    , const std::string& str = "" 
+                    , typename Impl::enable_if<
+                      (   Impl::IsNonTrivialReduceFunctor<FunctorType>::value &&
+                        ! Impl::is_integral< ExecPolicy >::value  &&
+                          Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )>::type * = 0 )
+{
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueOps<    FunctorType , typename ExecPolicy::work_tag >  ValueOps ;
+
+  // Wrap the result output request in a view to inform the implementation
+  // of the type and memory space.
+
+  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
+                                     , typename ValueTraits::value_type
+                                     , typename ValueTraits::pointer_type
+                                     >::type value_type ;
+
+  Kokkos::View< value_type
+              , HostSpace
+              , Kokkos::MemoryUnmanaged
+              >
+    result_view( ValueOps::pointer( result_ref )
+               , ValueTraits::value_count( functor )
+               );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+    uint64_t kpID = 0;
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     }
+#endif
+    
+  (void) Impl::ParallelReduce< FunctorType, ExecPolicy >( functor , policy , result_view );
+    
+#ifdef KOKKOSP_ENABLE_PROFILING
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::endParallelScan(kpID);
+     }
+#endif
+}
+
+// integral range policy and view ouput
+template< class FunctorTypeIn , class ViewType >
+inline
+void parallel_reduce( const size_t        work_count
+                    , const FunctorTypeIn & functor_in
+                    , const ViewType    & result_view
+                    , const std::string& str = "" 
+                    , typename Impl::enable_if<( Impl::is_view<ViewType>::value &&
+                                                 Impl::is_same<
+                          typename Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space,
+                          Kokkos::Cuda>::value
+                        )>::type * = 0 )
+{
+  enum {FunctorHasValueType = Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value };
+  typedef typename
+    Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space
+      execution_space ;
+
+  typedef RangePolicy< execution_space > ExecPolicy ;
+
+  typedef typename Kokkos::Impl::if_c<FunctorHasValueType, FunctorTypeIn, Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,typename ViewType::value_type> >::type FunctorType;
+
+  FunctorType functor = Impl::if_c<FunctorHasValueType,FunctorTypeIn,FunctorType>::select(functor_in,FunctorType(functor_in));
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+    uint64_t kpID = 0;
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     }
+#endif
+    
+  (void) Impl::ParallelReduce< FunctorType, ExecPolicy >( functor , ExecPolicy(0,work_count) , result_view );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::endParallelScan(kpID);
+     }
+#endif
+
+}
+
+// integral range policy and pod or array of pod output
+template< class FunctorTypeIn , class ResultType>
+inline
+void parallel_reduce( const size_t        work_count
+                    , const FunctorTypeIn & functor_in
+                    , ResultType& result
+                    , const std::string& str = "" 
+                    , typename Impl::enable_if< ! Impl::is_view<ResultType>::value &&
+                                                ! Impl::IsNonTrivialReduceFunctor<FunctorTypeIn>::value &&
+                                                Impl::is_same<
+                             typename Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space,
+                             Kokkos::Cuda>::value >::type * = 0 )
+{
+  typedef typename
+    Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorTypeIn , void >::execution_space
+      execution_space ;
+  typedef Kokkos::RangePolicy< execution_space > ExecPolicy ;
+
+  typedef Impl::CudaFunctorAdapter<FunctorTypeIn,ExecPolicy,ResultType> FunctorType;
+
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueOps<    FunctorType , void >  ValueOps ;
+
+
+  // Wrap the result output request in a view to inform the implementation
+  // of the type and memory space.
+
+  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
+                                     , typename ValueTraits::value_type
+                                     , typename ValueTraits::pointer_type
+                                     >::type value_type ;
+
+  Kokkos::View< value_type
+              , HostSpace
+              , Kokkos::MemoryUnmanaged
+              >
+    result_view( ValueOps::pointer( result )
+               , 1
+               );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+    uint64_t kpID = 0;
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     }
+#endif
+    
+  (void) Impl::ParallelReduce< FunctorType , ExecPolicy >( FunctorType(functor_in) , ExecPolicy(0,work_count) , result_view );
+    
+#ifdef KOKKOSP_ENABLE_PROFILING
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::endParallelScan(kpID);
+     }
+#endif
+}
+
+template< class FunctorType>
+inline
+void parallel_reduce( const size_t        work_count
+                    , const FunctorType & functor
+                    , typename Kokkos::Impl::FunctorValueTraits< FunctorType , void >::reference_type result
+                    , const std::string& str = "" 
+                    , typename Impl::enable_if< Impl::IsNonTrivialReduceFunctor<FunctorType>::value &&
+                                                Impl::is_same<
+                             typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
+                             Kokkos::Cuda>::value >::type * = 0 )
+{
+
+  typedef typename
+    Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
+      execution_space ;
+  typedef Kokkos::RangePolicy< execution_space > ExecPolicy ;
+
+
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueOps<    FunctorType , void >  ValueOps ;
+
+
+  // Wrap the result output request in a view to inform the implementation
+  // of the type and memory space.
+
+  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
+                                     , typename ValueTraits::value_type
+                                     , typename ValueTraits::pointer_type
+                                     >::type value_type ;
+
+  Kokkos::View< value_type
+              , HostSpace
+              , Kokkos::MemoryUnmanaged
+              >
+    result_view( ValueOps::pointer( result )
+               , ValueTraits::value_count( functor )
+               );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+    uint64_t kpID = 0;
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     }
+#endif
+    
+  (void) Impl::ParallelReduce< FunctorType , ExecPolicy >( functor , ExecPolicy(0,work_count) , result_view );
+    
+#ifdef KOKKOSP_ENABLE_PROFILING
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::endParallelScan(kpID);
+     }
+#endif
+}
+
+} // namespace Kokkos
+#endif /* defined( __CUDACC__ ) */
+
+#endif /* #ifndef KOKKOS_CUDA_PARALLEL_HPP */
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..5ef16711eecb006103f32e65d84bd3d310be2719
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp
@@ -0,0 +1,424 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_REDUCESCAN_HPP
+#define KOKKOS_CUDA_REDUCESCAN_HPP
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#if defined( __CUDACC__ ) && defined( KOKKOS_HAVE_CUDA )
+
+#include <utility>
+
+#include <Kokkos_Parallel.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <Cuda/Kokkos_Cuda_Vectorization.hpp>
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+
+
+//Shfl based reductions
+/*
+ *  Algorithmic constraints:
+ *   (a) threads with same threadIdx.y have same value
+ *   (b) blockDim.x == power of two
+ *   (c) blockDim.z == 1
+ */
+
+template< class ValueType , class JoinOp>
+__device__
+inline void cuda_intra_warp_reduction( ValueType& result,
+                                       const JoinOp& join,
+                                       const int max_active_thread = blockDim.y) {
+
+  unsigned int shift = 1;
+
+  //Reduce over values from threads with different threadIdx.y
+  while(blockDim.x * shift < 32 ) {
+    const ValueType tmp = shfl_down(result, blockDim.x*shift,32u);
+    //Only join if upper thread is active (this allows non power of two for blockDim.y
+    if(threadIdx.y + shift < max_active_thread)
+      join(result , tmp);
+    shift*=2;
+  }
+
+  result = shfl(result,0,32);
+}
+
+template< class ValueType , class JoinOp>
+__device__
+inline void cuda_inter_warp_reduction( ValueType& value,
+                                       const JoinOp& join,
+                                       const int max_active_thread = blockDim.y) {
+
+  #define STEP_WIDTH 4
+  __shared__ char sh_result[sizeof(ValueType)*STEP_WIDTH];
+  ValueType* result = (ValueType*) & sh_result;
+  const unsigned step = 32 / blockDim.x;
+  unsigned shift = STEP_WIDTH;
+  const int id = threadIdx.y%step==0?threadIdx.y/step:65000;
+  if(id < STEP_WIDTH ) {
+    result[id] = value;
+  }
+  __syncthreads();
+  while (shift<=max_active_thread/step) {
+    if(shift<=id && shift+STEP_WIDTH>id && threadIdx.x==0) {
+      join(result[id%STEP_WIDTH],value);
+    }
+    __syncthreads();
+    shift+=STEP_WIDTH;
+  }
+
+
+  value = result[0];
+  for(int i = 1; (i*step<=max_active_thread) && i<STEP_WIDTH; i++)
+    join(value,result[i]);
+}
+
+template< class ValueType , class JoinOp>
+__device__
+inline void cuda_intra_block_reduction( ValueType& value,
+                                        const JoinOp& join,
+                                        const int max_active_thread = blockDim.y) {
+  cuda_intra_warp_reduction(value,join,max_active_thread);
+  cuda_inter_warp_reduction(value,join,max_active_thread);
+}
+
+template< class FunctorType , class JoinOp>
+__device__
+bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void >::reference_type  value,
+                                 const JoinOp& join,
+                                 Cuda::size_type * const m_scratch_space,
+                                 typename FunctorValueTraits< FunctorType , void >::pointer_type const result,
+                                 Cuda::size_type * const m_scratch_flags,
+                                 const int max_active_thread = blockDim.y) {
+  typedef typename FunctorValueTraits< FunctorType , void >::pointer_type pointer_type;
+  typedef typename FunctorValueTraits< FunctorType , void >::value_type value_type;
+
+  //Do the intra-block reduction with shfl operations and static shared memory
+  cuda_intra_block_reduction(value,join,max_active_thread);
+
+  const unsigned id = threadIdx.y*blockDim.x + threadIdx.x;
+
+  //One thread in the block writes block result to global scratch_memory
+  if(id == 0 ) {
+    pointer_type global = ((pointer_type) m_scratch_space) + blockIdx.x;
+    *global = value;
+  }
+
+  //One warp of last block performs inter block reduction through loading the block values from global scratch_memory
+  bool last_block = false;
+
+  __syncthreads();
+  if ( id < 32 ) {
+    Cuda::size_type count;
+
+    //Figure out whether this is the last block
+    if(id == 0)
+      count = Kokkos::atomic_fetch_add(m_scratch_flags,1);
+    count = Kokkos::shfl(count,0,32);
+
+    //Last block does the inter block reduction
+    if( count == gridDim.x - 1) {
+      //set flag back to zero
+      if(id == 0)
+        *m_scratch_flags = 0;
+      last_block = true;
+      value = 0;
+
+      pointer_type const volatile global = (pointer_type) m_scratch_space ;
+
+      //Reduce all global values with splitting work over threads in one warp
+      const int step_size = blockDim.x*blockDim.y < 32 ? blockDim.x*blockDim.y : 32;
+      for(int i=id; i<gridDim.x; i+=step_size) {
+        value_type tmp = global[i];
+        join(value, tmp);
+      }
+
+      //Perform shfl reductions within the warp only join if contribution is valid (allows gridDim.x non power of two and <32)
+      if (blockDim.x*blockDim.y > 1) {
+        value_type tmp = Kokkos::shfl_down(value, 1,32);
+        if( id + 1 < gridDim.x )
+          join(value, tmp);
+      }
+      if (blockDim.x*blockDim.y > 2) {
+        value_type tmp = Kokkos::shfl_down(value, 2,32);
+        if( id + 2 < gridDim.x )
+          join(value, tmp);
+      }
+      if (blockDim.x*blockDim.y > 4) {
+        value_type tmp = Kokkos::shfl_down(value, 4,32);
+        if( id + 4 < gridDim.x )
+          join(value, tmp);
+      }
+      if (blockDim.x*blockDim.y > 8) {
+        value_type tmp = Kokkos::shfl_down(value, 8,32);
+        if( id + 8 < gridDim.x )
+          join(value, tmp);
+      }
+      if (blockDim.x*blockDim.y > 16) {
+        value_type tmp = Kokkos::shfl_down(value, 16,32);
+        if( id + 16 < gridDim.x )
+          join(value, tmp);
+      }
+    }
+  }
+
+  //The last block has in its thread=0 the global reduction value through "value"
+  return last_block;
+}
+
+//----------------------------------------------------------------------------
+// See section B.17 of Cuda C Programming Guide Version 3.2
+// for discussion of
+//   __launch_bounds__(maxThreadsPerBlock,minBlocksPerMultiprocessor)
+// function qualifier which could be used to improve performance.
+//----------------------------------------------------------------------------
+// Maximize shared memory and minimize L1 cache:
+//   cudaFuncSetCacheConfig(MyKernel, cudaFuncCachePreferShared );
+// For 2.0 capability: 48 KB shared and 16 KB L1
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/*
+ *  Algorithmic constraints:
+ *   (a) blockDim.y is a power of two
+ *   (b) blockDim.y <= 512
+ *   (c) blockDim.x == blockDim.z == 1
+ */
+
+template< bool DoScan , class FunctorType , class ArgTag >
+__device__
+void cuda_intra_block_reduce_scan( const FunctorType & functor ,
+                                   const typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type base_data )
+{
+  typedef FunctorValueTraits< FunctorType , ArgTag >  ValueTraits ;
+  typedef FunctorValueJoin<   FunctorType , ArgTag >  ValueJoin ;
+
+  typedef typename ValueTraits::pointer_type  pointer_type ;
+
+  const unsigned value_count   = ValueTraits::value_count( functor );
+  const unsigned BlockSizeMask = blockDim.y - 1 ;
+
+  // Must have power of two thread count
+
+  if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_intra_block_scan requires power-of-two blockDim"); }
+
+#define BLOCK_REDUCE_STEP( R , TD , S )  \
+  if ( ! ( R & ((1<<(S+1))-1) ) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S)) ); }
+
+#define BLOCK_SCAN_STEP( TD , N , S )  \
+  if ( N == (1<<S) ) { ValueJoin::join( functor , TD , (TD - (value_count<<S))); }
+
+  const unsigned     rtid_intra = threadIdx.y ^ BlockSizeMask ;
+  const pointer_type tdata_intra = base_data + value_count * threadIdx.y ;
+
+  { // Intra-warp reduction:
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,0)
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,1)
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,2)
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,3)
+    BLOCK_REDUCE_STEP(rtid_intra,tdata_intra,4)
+  }
+
+  __syncthreads(); // Wait for all warps to reduce
+
+  { // Inter-warp reduce-scan by a single warp to avoid extra synchronizations
+    const unsigned rtid_inter = ( threadIdx.y ^ BlockSizeMask ) << CudaTraits::WarpIndexShift ;
+
+    if ( rtid_inter < blockDim.y ) {
+
+      const pointer_type tdata_inter = base_data + value_count * ( rtid_inter ^ BlockSizeMask );
+
+      if ( (1<<5) < BlockSizeMask ) {                        BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,5) }
+      if ( (1<<6) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,6) }
+      if ( (1<<7) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,7) }
+      if ( (1<<8) < BlockSizeMask ) { __threadfence_block(); BLOCK_REDUCE_STEP(rtid_inter,tdata_inter,8) }
+
+      if ( DoScan ) {
+
+        int n = ( rtid_inter &  32 ) ?  32 : (
+                ( rtid_inter &  64 ) ?  64 : (
+                ( rtid_inter & 128 ) ? 128 : (
+                ( rtid_inter & 256 ) ? 256 : 0 )));
+
+        if ( ! ( rtid_inter + n < blockDim.y ) ) n = 0 ;
+
+        BLOCK_SCAN_STEP(tdata_inter,n,8)
+        BLOCK_SCAN_STEP(tdata_inter,n,7)
+        BLOCK_SCAN_STEP(tdata_inter,n,6)
+        BLOCK_SCAN_STEP(tdata_inter,n,5)
+      }
+    }
+  }
+
+  __syncthreads(); // Wait for inter-warp reduce-scan to complete
+
+  if ( DoScan ) {
+    int n = ( rtid_intra &  1 ) ?  1 : (
+            ( rtid_intra &  2 ) ?  2 : (
+            ( rtid_intra &  4 ) ?  4 : (
+            ( rtid_intra &  8 ) ?  8 : (
+            ( rtid_intra & 16 ) ? 16 : 0 ))));
+
+    if ( ! ( rtid_intra + n < blockDim.y ) ) n = 0 ;
+
+    BLOCK_SCAN_STEP(tdata_intra,n,4) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,3) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,2) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,1) __threadfence_block();
+    BLOCK_SCAN_STEP(tdata_intra,n,0)
+  }
+
+#undef BLOCK_SCAN_STEP
+#undef BLOCK_REDUCE_STEP
+}
+
+//----------------------------------------------------------------------------
+/**\brief  Input value-per-thread starting at 'shared_data'.
+ *         Reduction value at last thread's location.
+ *
+ *  If 'DoScan' then write blocks' scan values and block-groups' scan values.
+ *
+ *  Global reduce result is in the last threads' 'shared_data' location.
+ */
+template< bool DoScan , class FunctorType , class ArgTag >
+__device__
+bool cuda_single_inter_block_reduce_scan( const FunctorType     & functor ,
+                                          const Cuda::size_type   block_id ,
+                                          const Cuda::size_type   block_count ,
+                                          Cuda::size_type * const shared_data ,
+                                          Cuda::size_type * const global_data ,
+                                          Cuda::size_type * const global_flags )
+{
+  typedef Cuda::size_type                  size_type ;
+  typedef FunctorValueTraits< FunctorType , ArgTag >  ValueTraits ;
+  typedef FunctorValueJoin<   FunctorType , ArgTag >  ValueJoin ;
+  typedef FunctorValueInit<   FunctorType , ArgTag >  ValueInit ;
+  typedef FunctorValueOps<    FunctorType , ArgTag >  ValueOps ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const unsigned BlockSizeMask  = blockDim.y - 1 ;
+  const unsigned BlockSizeShift = power_of_two_if_valid( blockDim.y );
+
+  // Must have power of two thread count
+  if ( BlockSizeMask & blockDim.y ) { Kokkos::abort("Cuda::cuda_single_inter_block_reduce_scan requires power-of-two blockDim"); }
+
+  const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
+    word_count( ValueTraits::value_size( functor ) / sizeof(size_type) );
+
+  // Reduce the accumulation for the entire block.
+  cuda_intra_block_reduce_scan<false,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
+
+  {
+    // Write accumulation total to global scratch space.
+    // Accumulation total is the last thread's data.
+    size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
+    size_type * const global = global_data + word_count.value * block_id ;
+
+    for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; }
+  }
+
+  // Contributing blocks note that their contribution has been completed via an atomic-increment flag
+  // If this block is not the last block to contribute to this group then the block is done.
+  const bool is_last_block =
+    ! __syncthreads_or( threadIdx.y ? 0 : ( 1 + atomicInc( global_flags , block_count - 1 ) < block_count ) );
+
+  if ( is_last_block ) {
+
+    const size_type b = ( long(block_count) * long(threadIdx.y) ) >> BlockSizeShift ;
+    const size_type e = ( long(block_count) * long( threadIdx.y + 1 ) ) >> BlockSizeShift ;
+
+    {
+      void * const shared_ptr = shared_data + word_count.value * threadIdx.y ;
+      reference_type shared_value = ValueInit::init( functor , shared_ptr );
+
+      for ( size_type i = b ; i < e ; ++i ) {
+        ValueJoin::join( functor , shared_ptr , global_data + word_count.value * i );
+      }
+    }
+
+    cuda_intra_block_reduce_scan<DoScan,FunctorType,ArgTag>( functor , pointer_type(shared_data) );
+
+    if ( DoScan ) {
+
+      size_type * const shared_value = shared_data + word_count.value * ( threadIdx.y ? threadIdx.y - 1 : blockDim.y );
+
+      if ( ! threadIdx.y ) { ValueInit::init( functor , shared_value ); }
+
+      // Join previous inclusive scan value to each member
+      for ( size_type i = b ; i < e ; ++i ) {
+        size_type * const global_value = global_data + word_count.value * i ;
+        ValueJoin::join( functor , shared_value , global_value );
+        ValueOps ::copy( functor , global_value , shared_value );
+      }
+    }
+  }
+
+  return is_last_block ;
+}
+
+// Size in bytes required for inter block reduce or scan
+template< bool DoScan , class FunctorType , class ArgTag >
+inline
+unsigned cuda_single_inter_block_reduce_scan_shmem( const FunctorType & functor , const unsigned BlockSize )
+{
+  return ( BlockSize + 2 ) * Impl::FunctorValueTraits< FunctorType , ArgTag >::value_size( functor );
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( __CUDACC__ ) */
+#endif /* KOKKOS_CUDA_REDUCESCAN_HPP */
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..0b8427cbe1e9664a41b6bb8b33b21320ad613d78
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp
@@ -0,0 +1,298 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_CUDA_VECTORIZATION_HPP
+#define KOKKOS_CUDA_VECTORIZATION_HPP
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <Kokkos_Cuda.hpp>
+
+namespace Kokkos {
+
+
+// Shuffle only makes sense on >= Kepler GPUs; it doesn't work on CPUs
+// or other GPUs.  We provide a generic definition (which is trivial
+// and doesn't do what it claims to do) because we don't actually use
+// this function unless we are on a suitable GPU, with a suitable
+// Scalar type.  (For example, in the mat-vec, the "ThreadsPerRow"
+// internal parameter depends both on the ExecutionSpace and the Scalar type,
+// and it controls whether shfl_down() gets called.)
+namespace Impl {
+
+  template< typename Scalar >
+  struct shfl_union {
+    enum {n = sizeof(Scalar)/4};
+    float fval[n];
+    KOKKOS_INLINE_FUNCTION
+    Scalar value() {
+      return *(Scalar*) fval;
+    }
+    KOKKOS_INLINE_FUNCTION
+    void operator= (Scalar& value_) {
+      float* const val_ptr = (float*) &value_;
+      for(int i=0; i<n ; i++) {
+        fval[i] = val_ptr[i];
+      }
+    }
+    KOKKOS_INLINE_FUNCTION
+    void operator= (const Scalar& value_) {
+      float* const val_ptr = (float*) &value_;
+      for(int i=0; i<n ; i++) {
+        fval[i] = val_ptr[i];
+      }
+    }
+
+  };
+}
+
+#ifdef __CUDA_ARCH__
+  #if (__CUDA_ARCH__ >= 300)
+
+    KOKKOS_INLINE_FUNCTION
+    int shfl(const int &val, const int& srcLane, const int& width ) {
+      return __shfl(val,srcLane,width);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float shfl(const float &val, const int& srcLane, const int& width ) {
+      return __shfl(val,srcLane,width);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type& width
+        ) {
+      Scalar tmp1 = val;
+      float tmp = *reinterpret_cast<float*>(&tmp1);
+      tmp = __shfl(tmp,srcLane,width);
+      return *reinterpret_cast<Scalar*>(&tmp);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double shfl(const double &val, const int& srcLane, const int& width) {
+      int lo = __double2loint(val);
+      int hi = __double2hiint(val);
+      lo = __shfl(lo,srcLane,width);
+      hi = __shfl(hi,srcLane,width);
+      return __hiloint2double(hi,lo);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 8) ,int>::type& width) {
+      int lo = __double2loint(*reinterpret_cast<const double*>(&val));
+      int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
+      lo = __shfl(lo,srcLane,width);
+      hi = __shfl(hi,srcLane,width);
+      const double tmp = __hiloint2double(hi,lo);
+      return *(reinterpret_cast<const Scalar*>(&tmp));
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) > 8) ,int>::type& width) {
+      Impl::shfl_union<Scalar> s_val;
+      Impl::shfl_union<Scalar> r_val;
+      s_val = val;
+
+      for(int i = 0; i<s_val.n; i++)
+        r_val.fval[i] = __shfl(s_val.fval[i],srcLane,width);
+      return r_val.value();
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int shfl_down(const int &val, const int& delta, const int& width) {
+      return __shfl_down(val,delta,width);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float shfl_down(const float &val, const int& delta, const int& width) {
+      return __shfl_down(val,delta,width);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
+      Scalar tmp1 = val;
+      float tmp = *reinterpret_cast<float*>(&tmp1);
+      tmp = __shfl_down(tmp,delta,width);
+      return *reinterpret_cast<Scalar*>(&tmp);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double shfl_down(const double &val, const int& delta, const int& width) {
+      int lo = __double2loint(val);
+      int hi = __double2hiint(val);
+      lo = __shfl_down(lo,delta,width);
+      hi = __shfl_down(hi,delta,width);
+      return __hiloint2double(hi,lo);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
+      int lo = __double2loint(*reinterpret_cast<const double*>(&val));
+      int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
+      lo = __shfl_down(lo,delta,width);
+      hi = __shfl_down(hi,delta,width);
+      const double tmp = __hiloint2double(hi,lo);
+      return *(reinterpret_cast<const Scalar*>(&tmp));
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
+      Impl::shfl_union<Scalar> s_val;
+      Impl::shfl_union<Scalar> r_val;
+      s_val = val;
+
+      for(int i = 0; i<s_val.n; i++)
+        r_val.fval[i] = __shfl_down(s_val.fval[i],delta,width);
+      return r_val.value();
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    int shfl_up(const int &val, const int& delta, const int& width ) {
+      return __shfl_up(val,delta,width);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    float shfl_up(const float &val, const int& delta, const int& width ) {
+      return __shfl_up(val,delta,width);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) {
+      Scalar tmp1 = val;
+      float tmp = *reinterpret_cast<float*>(&tmp1);
+      tmp = __shfl_up(tmp,delta,width);
+      return *reinterpret_cast<Scalar*>(&tmp);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    double shfl_up(const double &val, const int& delta, const int& width ) {
+      int lo = __double2loint(val);
+      int hi = __double2hiint(val);
+      lo = __shfl_up(lo,delta,width);
+      hi = __shfl_up(hi,delta,width);
+      return __hiloint2double(hi,lo);
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) {
+      int lo = __double2loint(*reinterpret_cast<const double*>(&val));
+      int hi = __double2hiint(*reinterpret_cast<const double*>(&val));
+      lo = __shfl_up(lo,delta,width);
+      hi = __shfl_up(hi,delta,width);
+      const double tmp = __hiloint2double(hi,lo);
+      return *(reinterpret_cast<const Scalar*>(&tmp));
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) > 8) , int >::type & width) {
+      Impl::shfl_union<Scalar> s_val;
+      Impl::shfl_union<Scalar> r_val;
+      s_val = val;
+
+      for(int i = 0; i<s_val.n; i++)
+        r_val.fval[i] = __shfl_up(s_val.fval[i],delta,width);
+      return r_val.value();
+    }
+
+  #else
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl(const Scalar &val, const int& srcLane, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl from a device with CC<3.0.");
+      return val;
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_down(const Scalar &val, const int& delta, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0.");
+      return val;
+    }
+
+    template<typename Scalar>
+    KOKKOS_INLINE_FUNCTION
+    Scalar shfl_up(const Scalar &val, const int& delta, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0.");
+      return val;
+    }
+  #endif
+#else
+    template<typename Scalar>
+    inline
+    Scalar shfl(const Scalar &val, const int& srcLane, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl from a device with CC<3.0.");
+      return val;
+    }
+
+    template<typename Scalar>
+    inline
+    Scalar shfl_down(const Scalar &val, const int& delta, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0.");
+      return val;
+    }
+
+    template<typename Scalar>
+    inline
+    Scalar shfl_up(const Scalar &val, const int& delta, const int& width) {
+      if(width > 1) Kokkos::abort("Error: calling shfl_down from a device with CC<3.0.");
+      return val;
+    }
+#endif
+
+
+
+}
+
+#endif // KOKKOS_HAVE_CUDA
+#endif
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..a78ead0cbace7b5a8a76d80ae905c7311bcecb26
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp
@@ -0,0 +1,312 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_VIEW_HPP
+#define KOKKOS_CUDA_VIEW_HPP
+
+#include <Kokkos_Macros.hpp>
+
+/* only compile this file if CUDA is enabled for Kokkos */
+#ifdef KOKKOS_HAVE_CUDA
+
+#include <cstring>
+
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_CudaSpace.hpp>
+#include <Kokkos_View.hpp>
+
+#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+struct AssertShapeBoundsAbort< CudaSpace >
+{
+  KOKKOS_INLINE_FUNCTION
+  static void apply( const size_t /* rank */ ,
+                     const size_t /* n0 */ , const size_t /* n1 */ ,
+                     const size_t /* n2 */ , const size_t /* n3 */ ,
+                     const size_t /* n4 */ , const size_t /* n5 */ ,
+                     const size_t /* n6 */ , const size_t /* n7 */ ,
+
+                     const size_t /* arg_rank */ ,
+                     const size_t /* i0 */ , const size_t /* i1 */ ,
+                     const size_t /* i2 */ , const size_t /* i3 */ ,
+                     const size_t /* i4 */ , const size_t /* i5 */ ,
+                     const size_t /* i6 */ , const size_t /* i7 */ )
+    {
+      Kokkos::abort("Kokkos::View array bounds violation");
+    }
+};
+
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4)
+// Via reinterpret_case this can be used to support all scalar types of those sizes.
+// Any other scalar type falls back to either normal reads out of global memory,
+// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0)
+
+template< typename ValueType
+        , class MemorySpace
+        , class AliasType =
+            typename Kokkos::Impl::if_c< ( sizeof(ValueType) ==  4 ) , int ,
+            typename Kokkos::Impl::if_c< ( sizeof(ValueType) ==  8 ) , ::int2 ,
+            typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 16 ) , ::int4 , void
+            >::type
+            >::type
+            >::type
+        >
+class CudaTextureFetch {
+private:
+
+  cuda_texture_object_type  m_obj ;
+  const ValueType         * m_alloc_ptr ;
+  int                       m_offset ;
+
+  void attach( const ValueType * const arg_ptr, AllocationTracker const & tracker )
+  {
+    typedef char const * const byte;
+
+    m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr());
+
+    size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr);
+    const bool ok_aligned = 0 == byte_offset % sizeof(ValueType);
+
+    const size_t count = tracker.alloc_size() / sizeof(ValueType);
+    const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count));
+
+    if (ok_aligned && ok_contains) {
+      if (tracker.attribute() == NULL ) {
+        MemorySpace::texture_object_attach(
+            tracker
+            , sizeof(ValueType)
+            , cudaCreateChannelDesc< AliasType >()
+            );
+      }
+      m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj;
+      m_offset = arg_ptr - m_alloc_ptr;
+    }
+    else if( !ok_contains ) {
+      throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer.");
+    }
+    else {
+      throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer.");
+    }
+  }
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {}
+
+  KOKKOS_INLINE_FUNCTION
+  ~CudaTextureFetch() {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch( const CudaTextureFetch & rhs )
+    : m_obj(       rhs.m_obj )
+    , m_alloc_ptr( rhs.m_alloc_ptr )
+    , m_offset(    rhs.m_offset )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
+    {
+      m_obj       = rhs.m_obj ;
+      m_alloc_ptr = rhs.m_alloc_ptr ;
+      m_offset    = rhs.m_offset ;
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION explicit
+  CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker )
+    : m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0)
+    {
+      #if defined( KOKKOS_USE_LDG_INTRINSIC )
+        m_alloc_ptr(arg_ptr);
+      #elif defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
+        if ( arg_ptr != NULL ) {
+          if ( tracker.is_valid() ) {
+            attach( arg_ptr, tracker );
+          }
+          else {
+            AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr);
+            if ( found_tracker.is_valid() ) {
+              attach( arg_ptr, found_tracker );
+            } else {
+              throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!");
+            }
+          }
+        }
+      #endif
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  operator const ValueType * () const { return m_alloc_ptr + m_offset ; }
+
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator[]( const iType & i ) const
+    {
+      #if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
+        AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i]));
+        return  *(reinterpret_cast<ValueType*> (&v));
+      #elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
+        AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
+        return  *(reinterpret_cast<ValueType*> (&v));
+      #else
+        return m_alloc_ptr[ i + m_offset ];
+      #endif
+  }
+};
+
+template< typename ValueType, class MemorySpace >
+class CudaTextureFetch< const ValueType, MemorySpace, void >
+{
+private:
+  const ValueType * m_ptr ;
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch() : m_ptr(0) {};
+
+  KOKKOS_INLINE_FUNCTION
+  ~CudaTextureFetch() {
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch( const ValueType * ptr, const AllocationTracker & ) : m_ptr(ptr) {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch( const CudaTextureFetch & rhs ) : m_ptr(rhs.m_ptr) {}
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) {
+    m_ptr = rhs.m_ptr;
+    return *this ;
+  }
+
+  explicit KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch( ValueType * const base_view_ptr, AllocationTracker const & /*tracker*/ ) {
+    m_ptr = base_view_ptr;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  CudaTextureFetch & operator = (const ValueType* base_view_ptr) {
+    m_ptr = base_view_ptr;
+    return *this;
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  operator const ValueType * () const { return m_ptr ; }
+
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  ValueType operator[]( const iType & i ) const
+  {
+    return m_ptr[ i ];
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Replace Default ViewDataHandle with Cuda texture fetch specialization
+ *          if 'const' value type, CudaSpace and random access.
+ */
+template< class ViewTraits >
+class ViewDataHandle< ViewTraits ,
+  typename enable_if< ( is_same< typename ViewTraits::memory_space,CudaSpace>::value ||
+                        is_same< typename ViewTraits::memory_space,CudaUVMSpace>::value )
+                      &&
+                      is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value
+                      &&
+                      ViewTraits::memory_traits::RandomAccess
+                    >::type >
+{
+public:
+  enum { ReturnTypeIsReference = false };
+
+  typedef Impl::CudaTextureFetch< typename ViewTraits::value_type
+                                , typename ViewTraits::memory_space> handle_type;
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & arg_tracker )
+  {
+    return handle_type(arg_data_ptr, arg_tracker);
+  }
+
+  typedef typename ViewTraits::value_type return_type;
+};
+
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif // KOKKOS_HAVE_CUDA
+#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */
+
diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..deb955ccd4755d43a24469171f2689d8c2a87dae
--- /dev/null
+++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_abort.hpp
@@ -0,0 +1,119 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_ABORT_HPP
+#define KOKKOS_CUDA_ABORT_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+#include "Kokkos_Macros.hpp"
+#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
+
+#include <cuda.h>
+
+#if ! defined( CUDA_VERSION ) || ( CUDA_VERSION < 4010 )
+#error "Cuda version 4.1 or greater required"
+#endif
+
+#if ( __CUDA_ARCH__ < 200 )
+#error "Cuda device capability 2.0 or greater required"
+#endif
+
+extern "C" {
+/*  Cuda runtime function, declared in <crt/device_runtime.h>
+ *  Requires capability 2.x or better.
+ */
+extern __device__ void __assertfail(
+  const void  *message,
+  const void  *file,
+  unsigned int line,
+  const void  *function,
+  size_t       charsize);
+}
+
+namespace Kokkos {
+namespace Impl {
+
+__device__ inline
+void cuda_abort( const char * const message )
+{
+#ifndef __APPLE__
+  const char empty[] = "" ;
+
+  __assertfail( (const void *) message ,
+                (const void *) empty ,
+                (unsigned int) 0 ,
+                (const void *) empty ,
+                sizeof(char) );
+#endif
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+#else
+
+namespace Kokkos {
+namespace Impl {
+KOKKOS_INLINE_FUNCTION
+void cuda_abort( const char * const ) {}
+}
+}
+
+#endif /* #if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA )
+namespace Kokkos {
+__device__ inline
+void abort( const char * const message ) { Kokkos::Impl::cuda_abort(message); }
+}
+#endif /* defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_CUDA_ABORT_HPP */
+
diff --git a/lib/kokkos/core/src/KokkosExp_View.hpp b/lib/kokkos/core/src/KokkosExp_View.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..a2226f3de0562cacc88311ac001bf4c9b5d710fc
--- /dev/null
+++ b/lib/kokkos/core/src/KokkosExp_View.hpp
@@ -0,0 +1,1945 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPERIMENTAL_VIEW_HPP
+#define KOKKOS_EXPERIMENTAL_VIEW_HPP
+
+#include <string>
+#include <type_traits>
+#include <initializer_list>
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template< class > struct ViewDataAnalysis ;
+
+template< class , class = void , typename Enable = void >
+class ViewMapping { enum { is_assignable = false }; };
+
+template< class DstMemorySpace , class SrcMemorySpace >
+struct DeepCopy ;
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+/** \class ViewTraits
+ *  \brief Traits class for accessing attributes of a View.
+ *
+ * This is an implementation detail of View.  It is only of interest
+ * to developers implementing a new specialization of View.
+ *
+ * Template argument permutations:
+ *   - View< DataType , void         , void         , void >
+ *   - View< DataType , Space        , void         , void >
+ *   - View< DataType , Space        , MemoryTraits , void >
+ *   - View< DataType , Space        , void         , MemoryTraits >
+ *   - View< DataType , ArrayLayout  , void         , void >
+ *   - View< DataType , ArrayLayout  , Space        , void >
+ *   - View< DataType , ArrayLayout  , MemoryTraits , void   >
+ *   - View< DataType , ArrayLayout  , Space        , MemoryTraits >
+ *   - View< DataType , MemoryTraits , void         , void  >
+ */
+
+template< class DataType ,
+          class Arg1 = void ,
+          class Arg2 = void ,
+          class Arg3 = void >
+class ViewTraits {
+private:
+
+  // Layout, Space, and MemoryTraits are optional
+  // but need to appear in that order. That means Layout
+  // can only be Arg1, Space can be Arg1 or Arg2, and
+  // MemoryTraits can be Arg1, Arg2 or Arg3
+
+  enum { Arg1IsLayout = Kokkos::Impl::is_array_layout<Arg1>::value };
+
+  enum { Arg1IsSpace = Kokkos::Impl::is_space<Arg1>::value };
+  enum { Arg2IsSpace = Kokkos::Impl::is_space<Arg2>::value };
+
+  enum { Arg1IsMemoryTraits = Kokkos::Impl::is_memory_traits<Arg1>::value };
+  enum { Arg2IsMemoryTraits = Kokkos::Impl::is_memory_traits<Arg2>::value };
+  enum { Arg3IsMemoryTraits = Kokkos::Impl::is_memory_traits<Arg3>::value };
+
+  enum { Arg1IsVoid = std::is_same< Arg1 , void >::value };
+  enum { Arg2IsVoid = std::is_same< Arg2 , void >::value };
+  enum { Arg3IsVoid = std::is_same< Arg3 , void >::value };
+
+  static_assert( 1 == Arg1IsLayout + Arg1IsSpace + Arg1IsMemoryTraits + Arg1IsVoid
+               , "Template argument #1 must be layout, space, traits, or void" );
+
+  // If Arg1 is Layout       then Arg2 is Space, MemoryTraits, or void
+  // If Arg1 is Space        then Arg2 is MemoryTraits or void
+  // If Arg1 is MemoryTraits then Arg2 is void
+  // If Arg1 is Void         then Arg2 is void
+
+  static_assert( ( Arg1IsLayout       && ( 1 == Arg2IsSpace + Arg2IsMemoryTraits + Arg2IsVoid ) ) ||
+                 ( Arg1IsSpace        && ( 0 == Arg2IsSpace ) && ( 1 == Arg2IsMemoryTraits + Arg2IsVoid ) ) ||
+                 ( Arg1IsMemoryTraits && Arg2IsVoid ) ||
+                 ( Arg1IsVoid         && Arg2IsVoid )
+               , "Template argument #2 must be space, traits, or void" );
+
+  // Arg3 is MemoryTraits or void and at most one argument is MemoryTraits
+  static_assert( ( 1 == Arg3IsMemoryTraits + Arg3IsVoid ) &&
+                 ( Arg1IsMemoryTraits + Arg2IsMemoryTraits + Arg3IsMemoryTraits <= 1 )
+               , "Template argument #3 must be traits or void" );
+
+  typedef
+    typename std::conditional< Arg1IsSpace , Arg1 ,
+    typename std::conditional< Arg2IsSpace , Arg2 , Kokkos::DefaultExecutionSpace
+    >::type >::type::execution_space
+     ExecutionSpace ;
+
+  typedef 
+    typename std::conditional< Arg1IsSpace , Arg1 ,
+    typename std::conditional< Arg2IsSpace , Arg2 , Kokkos::DefaultExecutionSpace
+    >::type >::type::memory_space 
+      MemorySpace ;
+
+  typedef
+    typename Kokkos::Impl::is_space<
+    typename std::conditional< Arg1IsSpace , Arg1 ,
+    typename std::conditional< Arg2IsSpace , Arg2 , Kokkos::DefaultExecutionSpace
+    >::type >::type >::host_mirror_space
+      HostMirrorSpace ;
+
+  typedef
+    typename std::conditional< Arg1IsLayout , Arg1 , typename ExecutionSpace::array_layout >::type
+      ArrayLayout ;
+
+  // Arg1, Arg2, or Arg3 may be memory traits
+  typedef
+    typename std::conditional< Arg1IsMemoryTraits , Arg1 ,
+    typename std::conditional< Arg2IsMemoryTraits , Arg2 ,
+    typename std::conditional< Arg3IsMemoryTraits , Arg3 , MemoryManaged
+    >::type >::type >::type
+      MemoryTraits ;
+
+  typedef Kokkos::Experimental::Impl::ViewDataAnalysis< DataType >  analysis ;
+
+public:
+
+  //------------------------------------
+  // Data type traits:
+
+  typedef typename analysis::type            data_type ;
+  typedef typename analysis::const_type      const_data_type ;
+  typedef typename analysis::non_const_type  non_const_data_type ;
+
+  //------------------------------------
+  // Compatible array of trivial type traits:
+
+  typedef typename analysis::array_scalar_type            array_scalar_type ;
+  typedef typename analysis::const_array_scalar_type      const_array_scalar_type ;
+  typedef typename analysis::non_const_array_scalar_type  non_const_array_scalar_type ;
+
+  //------------------------------------
+  // Value type traits:
+
+  typedef typename analysis::value_type            value_type ;
+  typedef typename analysis::const_value_type      const_value_type ;
+  typedef typename analysis::non_const_value_type  non_const_value_type ;
+
+  //------------------------------------
+  // Mapping traits:
+
+  typedef ArrayLayout                    array_layout ;
+  typedef typename analysis::dimension   dimension ;
+  typedef typename analysis::specialize  specialize /* mapping specialization tag */ ;
+
+  enum { rank         = dimension::rank };
+  enum { rank_dynamic = dimension::rank_dynamic };
+
+  //------------------------------------
+  // Execution space, memory space, memory access traits, and host mirror space.
+
+  typedef ExecutionSpace                      execution_space ;
+  typedef MemorySpace                         memory_space ;
+  typedef Device<ExecutionSpace,MemorySpace>  device_type ;
+  typedef MemoryTraits                        memory_traits ;
+  typedef HostMirrorSpace                     host_mirror_space ;
+
+  typedef typename memory_space::size_type  size_type ;
+
+  enum { is_hostspace      = std::is_same< memory_space , HostSpace >::value };
+  enum { is_managed        = memory_traits::Unmanaged    == 0 };
+  enum { is_random_access  = memory_traits::RandomAccess == 1 };
+
+  //------------------------------------
+};
+
+/** \class View
+ *  \brief View to an array of data.
+ *
+ * A View represents an array of one or more dimensions.
+ * For details, please refer to Kokkos' tutorial materials.
+ *
+ * \section Kokkos_View_TemplateParameters Template parameters
+ *
+ * This class has both required and optional template parameters.  The
+ * \c DataType parameter must always be provided, and must always be
+ * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are
+ * placeholders for different template parameters.  The default value
+ * of the fifth template parameter \c Specialize suffices for most use
+ * cases.  When explaining the template parameters, we won't refer to
+ * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer
+ * to the valid categories of template parameters, in whatever order
+ * they may occur.
+ *
+ * Valid ways in which template arguments may be specified:
+ *   - View< DataType , Space >
+ *   - View< DataType , Space  ,         MemoryTraits >
+ *   - View< DataType , Space  , void  , MemoryTraits >
+ *   - View< DataType , Layout , Space >
+ *   - View< DataType , Layout , Space , MemoryTraits >
+ *
+ * \tparam DataType (required) This indicates both the type of each
+ *   entry of the array, and the combination of compile-time and
+ *   run-time array dimension(s).  For example, <tt>double*</tt>
+ *   indicates a one-dimensional array of \c double with run-time
+ *   dimension, and <tt>int*[3]</tt> a two-dimensional array of \c int
+ *   with run-time first dimension and compile-time second dimension
+ *   (of 3).  In general, the run-time dimensions (if any) must go
+ *   first, followed by zero or more compile-time dimensions.  For
+ *   more examples, please refer to the tutorial materials.
+ *
+ * \tparam Space (required) The memory space.
+ *
+ * \tparam Layout (optional) The array's layout in memory.  For
+ *   example, LayoutLeft indicates a column-major (Fortran style)
+ *   layout, and LayoutRight a row-major (C style) layout.  If not
+ *   specified, this defaults to the preferred layout for the
+ *   <tt>Space</tt>.
+ *
+ * \tparam MemoryTraits (optional) Assertion of the user's intended
+ *   access behavior.  For example, RandomAccess indicates read-only
+ *   access with limited spatial locality, and Unmanaged lets users
+ *   wrap externally allocated memory in a View without automatic
+ *   deallocation.
+ *
+ * \section Kokkos_View_MT MemoryTraits discussion
+ *
+ * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on Space
+ *
+ * Some \c MemoryTraits options may have different interpretations for
+ * different \c Space types.  For example, with the Cuda device,
+ * \c RandomAccess tells Kokkos to fetch the data through the texture
+ * cache, whereas the non-GPU devices have no such hardware construct.
+ *
+ * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits
+ *
+ * Users should defer applying the optional \c MemoryTraits parameter
+ * until the point at which they actually plan to rely on it in a
+ * computational kernel.  This minimizes the number of template
+ * parameters exposed in their code, which reduces the cost of
+ * compilation.  Users may always assign a View without specified
+ * \c MemoryTraits to a compatible View with that specification.
+ * For example:
+ * \code
+ * // Pass in the simplest types of View possible.
+ * void
+ * doSomething (View<double*, Cuda> out,
+ *              View<const double*, Cuda> in)
+ * {
+ *   // Assign the "generic" View in to a RandomAccess View in_rr.
+ *   // Note that RandomAccess View objects must have const data.
+ *   View<const double*, Cuda, RandomAccess> in_rr = in;
+ *   // ... do something with in_rr and out ...
+ * }
+ * \endcode
+ */
+template< class DataType
+        , class Arg1 = void /* ArrayLayout, SpaceType, or MemoryTraits */
+        , class Arg2 = void /* SpaceType or MemoryTraits */
+        , class Arg3 = void /* MemoryTraits */ >
+class View ;
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#include <impl/KokkosExp_ViewMapping.hpp>
+#include <impl/KokkosExp_ViewAllocProp.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+namespace {
+
+constexpr Kokkos::Experimental::Impl::ALL_t
+  ALL = Kokkos::Experimental::Impl::ALL_t();
+
+constexpr Kokkos::Experimental::Impl::WithoutInitializing_t
+  WithoutInitializing = Kokkos::Experimental::Impl::WithoutInitializing_t();
+
+constexpr Kokkos::Experimental::Impl::AllowPadding_t       
+  AllowPadding        = Kokkos::Experimental::Impl::AllowPadding_t();
+
+}
+
+/** \brief  Create View allocation parameter bundle from argument list.
+ *
+ *  Valid argument list members are:
+ *    1) label as a "string" or std::string
+ *    2) memory space instance of the View::memory_space type
+ *    3) execution space instance compatible with the View::memory_space
+ *    4) Kokkos::WithoutInitializing to bypass initialization
+ *    4) Kokkos::AllowPadding to allow allocation to pad dimensions for memory alignment
+ */
+template< class ... Args >
+inline
+Kokkos::Experimental::Impl::ViewAllocProp< Args ... >
+view_alloc( Args ... args )
+{
+  return Kokkos::Experimental::Impl::ViewAllocProp< Args ... >( args ... );
+}
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+/**\brief  Each R? template argument designates whether the subview argument is a range */
+template< class V
+        , bool R0 = false , bool R1 = false , bool R2 = false , bool R3 = false
+        , bool R4 = false , bool R5 = false , bool R6 = false , bool R7 = false >
+using Subview = typename Kokkos::Experimental::Impl::SubviewType< V, R0 , R1 , R2 , R3 , R4 , R5 , R6 , R7 >::type ;
+
+template< class DataType , class Arg1 , class Arg2 , class Arg3 >
+class View : public ViewTraits< DataType , Arg1 , Arg2 , Arg3 > {
+private:
+
+  template< class , class , class , class > friend class View ;
+
+  typedef ViewTraits< DataType , Arg1 , Arg2 , Arg3 >          traits ;
+  typedef Kokkos::Experimental::Impl::ViewMapping< traits >    map_type ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationTracker  track_type ;
+
+  track_type  m_track ;
+  map_type    m_map ;
+
+public:
+
+  //----------------------------------------
+  /** \brief  Compatible view of array of scalar types */
+  typedef View< typename traits::array_scalar_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                typename traits::memory_traits > 
+    array_type ;
+
+  /** \brief  Compatible view of const data type */
+  typedef View< typename traits::const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                typename traits::memory_traits > 
+    const_type ;
+
+  /** \brief  Compatible view of non-const data type */
+  typedef View< typename traits::non_const_data_type ,
+                    typename traits::array_layout ,
+                    typename traits::device_type ,
+                    typename traits::memory_traits > 
+    non_const_type ;
+
+  /** \brief  Compatible HostMirror view */
+  typedef View< typename traits::non_const_data_type ,
+                typename traits::array_layout ,
+                typename traits::host_mirror_space ,
+                void >
+    HostMirror ;
+
+  //----------------------------------------
+  // Domain dimensions
+
+  enum { Rank = map_type::Rank };
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_map.dimension_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_map.dimension_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_map.dimension_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_map.dimension_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_map.dimension_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_map.dimension_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return m_map.dimension_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_map.dimension_7(); }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_map.stride_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return m_map.stride_7(); }
+
+  //----------------------------------------
+  // Range span
+
+  typedef typename map_type::reference_type  reference_type ;
+
+  enum { reference_type_is_lvalue_reference = std::is_lvalue_reference< reference_type >::value };
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); }
+  KOKKOS_INLINE_FUNCTION constexpr bool   span_is_contiguous() const { return m_map.span_is_contiguous(); }
+  KOKKOS_INLINE_FUNCTION constexpr typename traits::value_type * data() const { return m_map.data(); }
+
+  // Deprecated, use 'span_is_contigous()' instead
+  KOKKOS_INLINE_FUNCTION constexpr bool   is_contiguous() const { return m_map.span_is_contiguous(); }
+  // Deprecated, use 'data()' instead
+  KOKKOS_INLINE_FUNCTION constexpr typename traits::value_type * ptr_on_device() const { return m_map.data(); }
+
+  //----------------------------------------
+
+private:
+
+  typedef typename
+    std::conditional< Rank == 0 , reference_type
+      , Kokkos::Experimental::Impl::Error_view_scalar_reference_to_non_scalar_view >::type 
+    scalar_operator_reference_type ;
+
+  typedef typename
+    std::conditional< Rank == 0 , const int
+      , Kokkos::Experimental::Impl::Error_view_scalar_reference_to_non_scalar_view >::type 
+    scalar_operator_index_type ;
+
+public:
+
+  // Rank == 0
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  scalar_operator_reference_type operator()() const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, 0, 0, 0, 0, 0, 0, 0, 0 );
+      return scalar_operator_reference_type( m_map.reference() );
+    }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  operator()( scalar_operator_index_type i0
+            , const int i1 = 0 , const int i2 = 0 , const int i3 = 0
+            , const int i4 = 0 , const int i5 = 0 , const int i6 = 0 , const int i7 = 0 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, i7 );
+      return m_map.reference();
+    }
+
+  // Rank == 1
+
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( Rank == 1 && std::is_integral<I0>::value
+                          ), reference_type >::type
+  operator[]( const I0 & i0 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, 0, 0, 0, 0, 0, 0, 0 );
+      return m_map.reference(i0);
+    }
+
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( Rank == 1 && std::is_integral<I0>::value
+                          ), reference_type >::type
+  operator()( const I0 & i0 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, 0, 0, 0, 0, 0, 0, 0 );
+      return m_map.reference(i0);
+    }
+
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  operator()( const I0 & i0
+            , typename std::enable_if<( Rank == 1 && std::is_integral<I0>::value ), const int >::type i1
+            , const int i2 = 0 , const int i3 = 0
+            , const int i4 = 0 , const int i5 = 0 , const int i6 = 0 , const int i7 = 0 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, i7 );
+      return m_map.reference(i0);
+    }
+
+  // Rank == 2
+
+  template< typename I0 , typename I1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( Rank == 2 &&
+                            std::is_integral<I0>::value &&
+                            std::is_integral<I1>::value
+                          ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, 0, 0, 0, 0, 0, 0 );
+      return m_map.reference(i0,i1);
+    }
+
+  template< typename I0 , typename I1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  operator()( const I0 & i0 , const I1 & i1
+            , typename std::enable_if<( Rank == 2 &&
+                                        std::is_integral<I0>::value &&
+                                        std::is_integral<I1>::value
+                                      ), const int >::type i2
+            , const int i3 = 0
+            , const int i4 = 0 , const int i5 = 0 , const int i6 = 0 , const int i7 = 0 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, i7 );
+      return m_map.reference(i0,i1);
+    }
+
+  // Rank == 3
+
+  template< typename I0 , typename I1 , typename I2 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( Rank == 3 &&
+                            std::is_integral<I0>::value &&
+                            std::is_integral<I1>::value &&
+                            std::is_integral<I2>::value 
+                          ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, 0, 0, 0, 0, 0 );
+      return m_map.reference(i0,i1,i2);
+    }
+
+  template< typename I0 , typename I1 , typename I2 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2
+            , typename std::enable_if<( Rank == 3 &&
+                                        std::is_integral<I0>::value &&
+                                        std::is_integral<I1>::value &&
+                                        std::is_integral<I2>::value
+                                      ), const int >::type i3
+            , const int i4 = 0 , const int i5 = 0 , const int i6 = 0 , const int i7 = 0 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, i7 );
+      return m_map.reference(i0,i1,i2);
+    }
+
+  // Rank == 4
+
+  template< typename I0 , typename I1 , typename I2 , typename I3 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( Rank == 4 &&
+                            std::is_integral<I0>::value &&
+                            std::is_integral<I1>::value &&
+                            std::is_integral<I2>::value &&
+                            std::is_integral<I3>::value
+                          ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, 0, 0, 0, 0 );
+      return m_map.reference(i0,i1,i2,i3);
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , typename std::enable_if<( Rank == 4 &&
+                                        std::is_integral<I0>::value &&
+                                        std::is_integral<I1>::value &&
+                                        std::is_integral<I2>::value &&
+                                        std::is_integral<I3>::value
+                                      ), const int >::type i4
+            , const int i5 = 0 , const int i6 = 0 , const int i7 = 0 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, i7 );
+      return m_map.reference(i0,i1,i2,i3);
+    }
+
+  // Rank == 5
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( Rank == 5 &&
+                            std::is_integral<I0>::value &&
+                            std::is_integral<I1>::value &&
+                            std::is_integral<I2>::value &&
+                            std::is_integral<I3>::value &&
+                            std::is_integral<I4>::value 
+                          ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, 0, 0, 0 );
+      return m_map.reference(i0,i1,i2,i3,i4);
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4
+            , typename std::enable_if<( Rank == 5 &&
+                                        std::is_integral<I0>::value &&
+                                        std::is_integral<I1>::value &&
+                                        std::is_integral<I2>::value &&
+                                        std::is_integral<I3>::value &&
+                                        std::is_integral<I4>::value
+                                      ), const int >::type i5
+            , const int i6 = 0 , const int i7 = 0 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, i7 );
+      return m_map.reference(i0,i1,i2,i3,i4);
+    }
+
+  // Rank == 6
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( Rank == 6 &&
+                            std::is_integral<I0>::value &&
+                            std::is_integral<I1>::value &&
+                            std::is_integral<I2>::value &&
+                            std::is_integral<I3>::value &&
+                            std::is_integral<I4>::value &&
+                            std::is_integral<I5>::value
+                          ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, 0, 0 );
+      return m_map.reference(i0,i1,i2,i3,i4,i5);
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5
+            , typename std::enable_if<( Rank == 6 &&
+                                        std::is_integral<I0>::value &&
+                                        std::is_integral<I1>::value &&
+                                        std::is_integral<I2>::value &&
+                                        std::is_integral<I3>::value &&
+                                        std::is_integral<I4>::value
+                                      ), const int >::type i6
+            , const int i7 = 0 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, i7 );
+      return m_map.reference(i0,i1,i2,i3,i4,i5);
+    }
+
+  // Rank == 7
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( Rank == 7 &&
+                            std::is_integral<I0>::value &&
+                            std::is_integral<I1>::value &&
+                            std::is_integral<I2>::value &&
+                            std::is_integral<I3>::value &&
+                            std::is_integral<I4>::value &&
+                            std::is_integral<I5>::value &&
+                            std::is_integral<I6>::value 
+                          ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, 0 );
+      return m_map.reference(i0,i1,i2,i3,i4,i5,i6);
+    }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6
+            , typename std::enable_if<( Rank == 7 &&
+                                        std::is_integral<I0>::value &&
+                                        std::is_integral<I1>::value &&
+                                        std::is_integral<I2>::value &&
+                                        std::is_integral<I3>::value &&
+                                        std::is_integral<I4>::value
+                                      ), const int >::type i7
+            ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, i7 );
+      return m_map.reference(i0,i1,i2,i3,i4,i5,i6);
+    }
+
+  // Rank == 8
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 , typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if<( Rank == 8 &&
+                            std::is_integral<I0>::value &&
+                            std::is_integral<I1>::value &&
+                            std::is_integral<I2>::value &&
+                            std::is_integral<I3>::value &&
+                            std::is_integral<I4>::value &&
+                            std::is_integral<I5>::value &&
+                            std::is_integral<I6>::value &&
+                            std::is_integral<I7>::value
+                          ), reference_type >::type
+  operator()( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+            , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7 ) const
+    {
+      KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( typename traits::memory_space, m_map, Rank, i0, i1, i2, i3, i4, i5, i6, i7 );
+      return m_map.reference(i0,i1,i2,i3,i4,i5,i6,i7);
+    }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  ~View() {}
+
+  KOKKOS_INLINE_FUNCTION
+  View() : m_track(), m_map() {}
+
+  KOKKOS_INLINE_FUNCTION
+  View( const View & rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  View( View && rhs ) : m_track( rhs.m_track ), m_map( rhs.m_map ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( const View & rhs ) { m_track = rhs.m_track ; m_map = rhs.m_map ; return *this ; }
+
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( View && rhs ) { m_track = rhs.m_track ; m_map = rhs.m_map ; return *this ; }
+
+  //----------------------------------------
+
+  template< class RT , class R1 , class R2 , class R3 >
+  KOKKOS_INLINE_FUNCTION
+  View( const View<RT,R1,R2,R3> & rhs )
+    : m_track( rhs.m_track )
+    , m_map()
+    {
+      typedef typename View<RT,R1,R2,R3>::traits  SrcTraits ;
+      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits >  Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible View copy construction" );
+      Mapping::assign( m_map , rhs.m_map , rhs.m_track );
+    }
+
+  template< class RT , class R1 , class R2 , class R3 >
+  KOKKOS_INLINE_FUNCTION
+  View( View<RT,R1,R2,R3> && rhs )
+    : m_track( rhs.m_track )
+    , m_map()
+    {
+      typedef typename View<RT,R1,R2,R3>::traits  SrcTraits ;
+      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits >  Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible View move construction" );
+      Mapping::assign( m_map , rhs.m_map , rhs.m_track );
+    }
+
+  template< class RT , class R1 , class R2 , class R3 >
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( const View<RT,R1,R2,R3> & rhs )
+    {
+      typedef typename View<RT,R1,R2,R3>::traits  SrcTraits ;
+      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits >  Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible View copy assignment" );
+      Mapping::assign( m_map , rhs.m_map , rhs.m_track );
+      m_track.operator=( rhs.m_track );
+      return *this ;
+    }
+
+  template< class RT , class R1 , class R2 , class R3 >
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( View<RT,R1,R2,R3> && rhs )
+    {
+      typedef typename View<RT,R1,R2,R3>::traits  SrcTraits ;
+      typedef Kokkos::Experimental::Impl::ViewMapping< traits , SrcTraits >  Mapping ;
+      static_assert( Mapping::is_assignable , "Incompatible View move assignment" );
+      Mapping::assign( m_map , rhs.m_map , rhs.m_track );
+      m_track.operator=( rhs.m_track );
+      return *this ;
+    }
+
+  //----------------------------------------
+  // Allocation according to allocation properties
+
+private:
+
+  // Must call destructor for non-trivial types
+  template< class ExecSpace >
+  struct DestroyFunctor {
+    map_type  m_map ;
+    ExecSpace m_space ;
+
+    KOKKOS_INLINE_FUNCTION
+    void destroy_shared_allocation() { m_map.destroy( m_space ); }
+  };
+
+public:
+
+  inline
+  const std::string label() const { return m_track.template get_label< typename traits::memory_space >(); }
+
+  template< class Prop >
+  explicit inline
+  View( const Prop & arg_prop
+      , const size_t arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0
+      )
+    : m_track()
+    , m_map()
+    {
+      // Merge the < execution_space , memory_space > into the properties.
+      typedef Kokkos::Experimental::Impl::ViewAllocProp< typename traits::device_type , Prop >  alloc_prop ;
+
+      typedef typename alloc_prop::execution_space  execution_space ;
+      typedef typename traits::memory_space         memory_space ;
+      typedef DestroyFunctor< execution_space >     destroy_functor ;
+      typedef Kokkos::Experimental::Impl::SharedAllocationRecord< memory_space , destroy_functor >  record_type ;
+
+      static_assert( traits::is_managed , "View allocation constructor requires managed memory" );
+
+      const alloc_prop prop( arg_prop );
+
+      // Query the mapping for byte-size of allocation.
+      const size_t alloc_size = map_type::memory_span( prop.allow_padding
+                                                     , arg_N0 , arg_N1 , arg_N2 , arg_N3
+                                                     , arg_N4 , arg_N5 , arg_N6 , arg_N7 );
+
+      // Allocate memory from the memory space.
+      record_type * const record = record_type::allocate( prop.memory , prop.label , alloc_size );
+
+      // Construct the mapping object prior to start of tracking
+      // to assign destroy functor and possibly initialize.
+      m_map = map_type( record->data()
+                      , prop.allow_padding
+                      , arg_N0 , arg_N1 , arg_N2 , arg_N3
+                      , arg_N4 , arg_N5 , arg_N6 , arg_N7 );
+
+      // Copy the destroy functor into the allocation record before initiating tracking.
+      record->m_destroy.m_map   = m_map ;
+      record->m_destroy.m_space = prop.execution ;
+
+      if ( prop.initialize.value ) {
+        m_map.construct( prop.execution );
+      }
+
+      // Destroy functor assigned and initialization complete, start tracking
+      m_track = track_type( record );
+    }
+
+  template< class Prop >
+  explicit inline
+  View( const Prop & arg_prop
+      , const typename traits::array_layout & arg_layout
+      )
+    : m_track()
+    , m_map()
+    {
+      // Merge the < execution_space , memory_space > into the properties.
+      typedef Kokkos::Experimental::Impl::ViewAllocProp< typename traits::device_type , Prop >  alloc_prop ;
+
+      typedef typename alloc_prop::execution_space  execution_space ;
+      typedef typename traits::memory_space         memory_space ;
+      typedef DestroyFunctor< execution_space >     destroy_functor ;
+      typedef Kokkos::Experimental::Impl::SharedAllocationRecord< memory_space , destroy_functor >  record_type ;
+
+      static_assert( traits::is_managed , "View allocation constructor requires managed memory" );
+
+      const alloc_prop prop( arg_prop );
+
+      // Query the mapping for byte-size of allocation.
+      const size_t alloc_size = map_type::memory_span( prop.allow_padding , arg_layout );
+
+      // Allocate memory from the memory space.
+      record_type * const record = record_type::allocate( prop.memory , prop.label , alloc_size );
+
+      // Construct the mapping object prior to start of tracking
+      // to assign destroy functor and possibly initialize.
+      m_map = map_type( record->data() , prop.allow_padding , arg_layout );
+
+      // Copy the destroy functor into the allocation record before initiating tracking.
+      record->m_destroy.m_map   = m_map ;
+      record->m_destroy.m_space = prop.execution ;
+
+      if ( prop.initialize.value ) {
+        m_map.construct( prop.execution );
+      }
+
+      // Destroy functor assigned and initialization complete, start tracking
+      m_track = track_type( record );
+    }
+
+  //----------------------------------------
+  // Memory span required to wrap these dimensions.
+  static constexpr size_t memory_span( const size_t arg_N0 = 0
+                                     , const size_t arg_N1 = 0
+                                     , const size_t arg_N2 = 0
+                                     , const size_t arg_N3 = 0
+                                     , const size_t arg_N4 = 0
+                                     , const size_t arg_N5 = 0
+                                     , const size_t arg_N6 = 0
+                                     , const size_t arg_N7 = 0
+                                     )
+    {
+      return map_type::memory_span( std::integral_constant<bool,false>()
+                                  , arg_N0 , arg_N1 , arg_N2 , arg_N3
+                                  , arg_N4 , arg_N5 , arg_N6 , arg_N7 );
+    }
+
+  explicit inline
+  View( typename traits::value_type * const arg_ptr
+      , const size_t arg_N0 = 0
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0
+      )
+    : m_track() // No memory tracking
+    , m_map( arg_ptr , std::integral_constant<bool,false>()
+           , arg_N0 , arg_N1 , arg_N2 , arg_N3
+           , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+    {}
+
+  explicit inline
+  View( typename traits::value_type * const arg_ptr
+      , typename traits::array_layout & arg_layout
+      )
+    : m_track() // No memory tracking
+    , m_map( arg_ptr , std::integral_constant<bool,false>(), arg_layout )
+    {}
+
+  //----------------------------------------
+  // Shared scratch memory constructor
+
+  static inline
+  size_t shmem_size( const size_t arg_N0 = 0 ,
+                     const size_t arg_N1 = 0 ,
+                     const size_t arg_N2 = 0 ,
+                     const size_t arg_N3 = 0 ,
+                     const size_t arg_N4 = 0 ,
+                     const size_t arg_N5 = 0 ,
+                     const size_t arg_N6 = 0 ,
+                     const size_t arg_N7 = 0 )
+  {
+    return map_type::memory_span( std::integral_constant<bool,false>()
+                                , arg_N0 , arg_N1 , arg_N2 , arg_N3
+                                , arg_N4 , arg_N5 , arg_N6 , arg_N7 );
+  }
+
+  explicit KOKKOS_INLINE_FUNCTION
+  View( const typename traits::execution_space::scratch_memory_space & arg_space
+      , const size_t arg_N0 = 0 
+      , const size_t arg_N1 = 0
+      , const size_t arg_N2 = 0
+      , const size_t arg_N3 = 0
+      , const size_t arg_N4 = 0
+      , const size_t arg_N5 = 0
+      , const size_t arg_N6 = 0
+      , const size_t arg_N7 = 0 )
+    : m_track() // No memory tracking
+    , m_map( arg_space.get_shmem( map_type::memory_span( std::integral_constant<bool,false>()
+                                                       , arg_N0 , arg_N1 , arg_N2 , arg_N3
+                                                       , arg_N4 , arg_N5 , arg_N6 , arg_N7 ) )
+           , std::integral_constant<bool,false>() 
+           , arg_N0 , arg_N1 , arg_N2 , arg_N3
+           , arg_N4 , arg_N5 , arg_N6 , arg_N7 )
+    {}
+
+  //----------------------------------------
+  // Subviews
+
+private:
+
+  explicit KOKKOS_INLINE_FUNCTION
+  View( const track_type & rhs )
+    : m_track( rhs )
+    , m_map()
+    {}
+
+public:
+
+  template< class D , class A1 , class A2 , class A3
+          , class T0 , class T1 , class T2 , class T3
+          , class T4 , class T5 , class T6 , class T7 >
+  friend
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::Experimental::Subview< View< D , A1 , A2 , A3 >
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T6>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T7>::is_range
+    >
+  subview( const View< D , A1 , A2 , A3 > & src
+         , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3
+         , T4 const & arg4 , T5 const & arg5 , T6 const & arg6 , T7 const & arg7
+         );
+  
+  template< class D , class A1 , class A2 , class A3
+          , class T0 , class T1 , class T2 , class T3
+          , class T4 , class T5 , class T6 >
+  friend
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::Experimental::Subview< View< D , A1 , A2 , A3 >
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T6>::is_range
+    >
+  subview( const View< D , A1 , A2 , A3 > & src
+         , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3
+         , T4 const & arg4 , T5 const & arg5 , T6 const & arg6
+         );
+
+  template< class D , class A1 , class A2 , class A3
+          , class T0 , class T1 , class T2 , class T3
+          , class T4 , class T5 >
+  friend
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::Experimental::Subview< View< D , A1 , A2 , A3 >
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range
+    >
+  subview( const View< D , A1 , A2 , A3 > & src
+         , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3
+         , T4 const & arg4 , T5 const & arg5
+         );
+  
+  template< class D , class A1 , class A2 , class A3
+          , class T0 , class T1 , class T2 , class T3
+          , class T4 >
+  friend
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::Experimental::Subview< View< D , A1 , A2 , A3 >
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range
+    >
+  subview( const View< D , A1 , A2 , A3 > & src
+         , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3
+         , T4 const & arg4
+         );
+  
+  template< class D , class A1 , class A2 , class A3
+          , class T0 , class T1 , class T2 , class T3 >
+  friend
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::Experimental::Subview< View< D , A1 , A2 , A3 >
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range
+    >
+  subview( const View< D , A1 , A2 , A3 > & src
+         , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3
+         );
+  
+  template< class D , class A1 , class A2 , class A3
+          , class T0 , class T1 , class T2 >
+  friend
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::Experimental::Subview< View< D , A1 , A2 , A3 >
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+    >
+  subview( const View< D , A1 , A2 , A3 > & src
+         , T0 const & arg0 , T1 const & arg1 , T2 const & arg2
+         );
+
+  template< class D , class A1 , class A2 , class A3
+          , class T0 , class T1 >
+  friend
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::Experimental::Subview< View< D , A1 , A2 , A3 >
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+    >
+  subview( const View< D , A1 , A2 , A3 > & src
+         , T0 const & arg0 , T1 const & arg1
+         );
+  
+  template< class D, class A1, class A2, class A3, class T0 >
+  friend
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::Experimental::Subview< View< D, A1, A2, A3 >
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+    >
+  subview( const View< D, A1, A2, A3 > & src , T0 const & arg0 );
+
+};
+
+template< class > struct is_view : public std::false_type {};
+
+template< class D, class A1, class A2, class A3 >
+struct is_view< View<D,A1,A2,A3> > : public std::true_type {};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template< class D, class A1, class A2, class A3
+        , class T0 , class T1 , class T2 , class T3
+        , class T4 , class T5 , class T6 , class T7 >
+KOKKOS_INLINE_FUNCTION
+Kokkos::Experimental::Subview< View< D, A1, A2, A3 >
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T6>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T7>::is_range
+  >
+subview( const View< D, A1, A2, A3 > & src
+       , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3
+       , T4 const & arg4 , T5 const & arg5 , T6 const & arg6 , T7 const & arg7
+       )
+{
+  typedef View< D, A1, A2, A3 >  SrcView ;
+
+  typedef Kokkos::Experimental::Impl::SubviewMapping
+    < typename SrcView::traits
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T6>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T7>::is_range
+    > Mapping ;
+
+  typedef typename Mapping::type  DstView ;
+
+  static_assert( SrcView::Rank == 8 , "Subview of rank 8 View requires 8 arguments" );
+
+  DstView dst( src.m_track );
+
+  Mapping::assign( dst.m_map, src.m_map, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 );
+
+  return dst ;
+}
+
+template< class D, class A1, class A2, class A3
+        , class T0 , class T1 , class T2 , class T3
+        , class T4 , class T5 , class T6 >
+KOKKOS_INLINE_FUNCTION
+Kokkos::Experimental::Subview< View< D, A1, A2, A3 >
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T6>::is_range
+  >
+subview( const View< D, A1, A2, A3 > & src
+       , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3
+       , T4 const & arg4 , T5 const & arg5 , T6 const & arg6
+       )
+{
+  typedef View< D, A1, A2, A3 >  SrcView ;
+
+  typedef Kokkos::Experimental::Impl::SubviewMapping
+    < typename SrcView::traits
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T6>::is_range
+    >  Mapping ;
+
+  typedef typename Mapping::type  DstView ;
+
+  static_assert( SrcView::Rank == 7 , "Subview of rank 7 View requires 7 arguments" );
+
+  DstView dst( src.m_track );
+
+  Mapping::assign( dst.m_map, src.m_map, arg0, arg1, arg2, arg3, arg4, arg5, arg6, 0 );
+
+  return dst ;
+}
+
+template< class D, class A1, class A2, class A3
+        , class T0 , class T1 , class T2 , class T3
+        , class T4 , class T5 >
+KOKKOS_INLINE_FUNCTION
+Kokkos::Experimental::Subview< View< D, A1, A2, A3 >
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range
+  >
+subview( const View< D, A1, A2, A3 > & src
+       , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3
+       , T4 const & arg4 , T5 const & arg5
+       )
+{
+  typedef View< D, A1, A2, A3 >  SrcView ;
+
+  typedef Kokkos::Experimental::Impl::SubviewMapping
+    < typename SrcView::traits
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T5>::is_range
+    >  Mapping ;
+
+  typedef typename Mapping::type  DstView ;
+
+  static_assert( SrcView::Rank == 6 , "Subview of rank 6 View requires 6 arguments" );
+
+  DstView dst( src.m_track );
+
+  Mapping::assign( dst.m_map, src.m_map, arg0, arg1, arg2, arg3, arg4, arg5, 0, 0 );
+
+  return dst ;
+}
+
+template< class D, class A1, class A2, class A3
+        , class T0 , class T1 , class T2 , class T3
+        , class T4 >
+KOKKOS_INLINE_FUNCTION
+Kokkos::Experimental::Subview< View< D, A1, A2, A3 >
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range
+  >
+subview( const View< D, A1, A2, A3 > & src
+       , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3
+       , T4 const & arg4
+       )
+{
+  typedef View< D, A1, A2, A3 >  SrcView ;
+
+  typedef Kokkos::Experimental::Impl::SubviewMapping
+    < typename SrcView::traits
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T4>::is_range
+    >  Mapping ;
+
+  typedef typename Mapping::type  DstView ;
+
+  static_assert( SrcView::Rank == 5 , "Subview of rank 5 View requires 5 arguments" );
+
+  DstView dst( src.m_track );
+
+  Mapping::assign( dst.m_map, src.m_map, arg0, arg1, arg2, arg3, arg4, 0, 0, 0 );
+
+  return dst ;
+}
+
+template< class D, class A1, class A2, class A3
+        , class T0 , class T1 , class T2 , class T3 >
+KOKKOS_INLINE_FUNCTION
+Kokkos::Experimental::Subview< View< D, A1, A2, A3 >
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range
+  >
+subview( const View< D, A1, A2, A3 > & src
+       , T0 const & arg0 , T1 const & arg1 , T2 const & arg2 , T3 const & arg3
+       )
+{
+  typedef View< D, A1, A2, A3 >  SrcView ;
+
+  typedef Kokkos::Experimental::Impl::SubviewMapping
+    < typename SrcView::traits
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T3>::is_range
+    >  Mapping ;
+
+  typedef typename Mapping::type  DstView ;
+
+  static_assert( SrcView::Rank == 4 , "Subview of rank 4 View requires 4 arguments" );
+
+  DstView dst( src.m_track );
+
+  Mapping::assign( dst.m_map, src.m_map, arg0, arg1, arg2, arg3, 0, 0, 0, 0 );
+
+  return dst ;
+}
+
+template< class D, class A1, class A2, class A3
+        , class T0 , class T1 , class T2 >
+KOKKOS_INLINE_FUNCTION
+Kokkos::Experimental::Subview< View< D, A1, A2, A3 >
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+  >
+subview( const View< D, A1, A2, A3 > & src
+       , T0 const & arg0 , T1 const & arg1 , T2 const & arg2
+       )
+{
+  typedef View< D, A1, A2, A3 >  SrcView ;
+
+  typedef Kokkos::Experimental::Impl::SubviewMapping
+    < typename SrcView::traits
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T2>::is_range
+    >  Mapping ;
+
+  typedef typename Mapping::type  DstView ;
+
+  static_assert( SrcView::Rank == 3 , "Subview of rank 3 View requires 3 arguments" );
+
+  DstView dst( src.m_track );
+
+  Mapping::assign( dst.m_map, src.m_map, arg0, arg1, arg2, 0, 0, 0, 0, 0 );
+
+  return dst ;
+}
+
+template< class D, class A1, class A2, class A3
+        , class T0 , class T1 >
+KOKKOS_INLINE_FUNCTION
+Kokkos::Experimental::Subview< View< D, A1, A2, A3 >
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+  >
+subview( const View< D, A1, A2, A3 > & src
+       , T0 const & arg0 , T1 const & arg1
+       )
+{
+  typedef View< D, A1, A2, A3 >  SrcView ;
+
+  typedef Kokkos::Experimental::Impl::SubviewMapping
+    < typename SrcView::traits
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T1>::is_range
+    >  Mapping ;
+
+  typedef typename Mapping::type  DstView ;
+
+  static_assert( SrcView::Rank == 2 , "Subview of rank 2 View requires 2 arguments" );
+
+  DstView dst( src.m_track );
+
+  Mapping::assign( dst.m_map, src.m_map, arg0, arg1, 0, 0, 0, 0, 0, 0 );
+
+  return dst ;
+}
+
+template< class D, class A1, class A2, class A3, class T0 >
+KOKKOS_INLINE_FUNCTION
+Kokkos::Experimental::Subview< View< D, A1, A2, A3 >
+  , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+  >
+subview( const View< D, A1, A2, A3 > & src , T0 const & arg0 )
+{
+  typedef View< D, A1, A2, A3 >  SrcView ;
+
+  typedef Kokkos::Experimental::Impl::SubviewMapping
+    < typename SrcView::traits
+    , Kokkos::Experimental::Impl::ViewOffsetRange<T0>::is_range
+    >  Mapping ;
+
+  typedef typename Mapping::type  DstView ;
+
+  static_assert( SrcView::Rank == 1 , "Subview of rank 1 View requires 1 arguments" );
+
+  DstView dst( src.m_track );
+
+  Mapping::assign( dst.m_map , src.m_map , arg0, 0, 0, 0, 0, 0, 0, 0 );
+
+  return dst ;
+}
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+template< class LT , class L1 , class L2 , class L3
+        , class RT , class R1 , class R2 , class R3 >
+KOKKOS_INLINE_FUNCTION
+bool operator == ( const View<LT,L1,L2,L3> & lhs ,
+                   const View<RT,R1,R2,R3> & rhs )
+{
+  // Same data, layout, dimensions
+  typedef ViewTraits<LT,L1,L2,L3>  lhs_traits ;
+  typedef ViewTraits<RT,R1,R2,R3>  rhs_traits ;
+
+  return
+    std::is_same< typename lhs_traits::const_value_type ,
+                  typename rhs_traits::const_value_type >::value &&
+    std::is_same< typename lhs_traits::array_layout ,
+                  typename rhs_traits::array_layout >::value &&
+    std::is_same< typename lhs_traits::memory_space ,
+                  typename rhs_traits::memory_space >::value &&
+    lhs_traits::Rank == rhs_traits::Rank &&
+    lhs.data()        == rhs.data() &&
+    lhs.span()        == rhs.span() &&
+    lhs.dimension_0() == rhs.dimension_0() &&
+    lhs.dimension_1() == rhs.dimension_1() &&
+    lhs.dimension_2() == rhs.dimension_2() &&
+    lhs.dimension_3() == rhs.dimension_3() &&
+    lhs.dimension_4() == rhs.dimension_4() &&
+    lhs.dimension_5() == rhs.dimension_5() &&
+    lhs.dimension_6() == rhs.dimension_6() &&
+    lhs.dimension_7() == rhs.dimension_7();
+}
+
+template< class LT , class L1 , class L2 , class L3
+        , class RT , class R1 , class R2 , class R3 >
+KOKKOS_INLINE_FUNCTION
+bool operator != ( const View<LT,L1,L2,L3> & lhs ,
+                   const View<RT,R1,R2,R3> & rhs )
+{
+  return ! ( operator==(lhs,rhs) );
+}
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template< class OutputView , typename Enable = void >
+struct ViewFill {
+
+  typedef typename OutputView::const_value_type  const_value_type ;
+
+  const OutputView output ;
+  const_value_type input ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_t i0 ) const
+  {
+    const size_t n1 = output.dimension_1();
+    const size_t n2 = output.dimension_2();
+    const size_t n3 = output.dimension_3();
+    const size_t n4 = output.dimension_4();
+    const size_t n5 = output.dimension_5();
+    const size_t n6 = output.dimension_6();
+    const size_t n7 = output.dimension_7();
+
+    for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
+    for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
+    for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) {
+    for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) {
+    for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) {
+    for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) {
+    for ( size_t i7 = 0 ; i7 < n7 ; ++i7 ) {
+      output(i0,i1,i2,i3,i4,i5,i6,i7) = input ;
+    }}}}}}}
+  }
+
+  ViewFill( const OutputView & arg_out , const_value_type & arg_in )
+    : output( arg_out ), input( arg_in )
+    {
+      typedef typename OutputView::execution_space  execution_space ;
+      typedef Kokkos::RangePolicy< execution_space > Policy ;
+
+      (void) Kokkos::Impl::ParallelFor< ViewFill , Policy >( *this , Policy( 0 , output.dimension_0() ) );
+
+      execution_space::fence();
+    }
+};
+
+template< class OutputView >
+struct ViewFill< OutputView , typename std::enable_if< OutputView::Rank == 0 >::type > {
+  ViewFill( const OutputView & dst , const typename OutputView::const_value_type & src )
+    {
+      Kokkos::Impl::DeepCopy< typename OutputView::memory_space , Kokkos::HostSpace >
+        ( dst.data() , & src , sizeof(typename OutputView::const_value_type) );
+    }
+};
+
+template< class OutputView , class InputView >
+struct ViewRemap {
+
+  const OutputView output ;
+  const InputView  input ;
+  const size_t n0 ;
+  const size_t n1 ;
+  const size_t n2 ;
+  const size_t n3 ;
+  const size_t n4 ;
+  const size_t n5 ;
+  const size_t n6 ;
+  const size_t n7 ;
+
+  ViewRemap( const OutputView & arg_out , const InputView & arg_in )
+    : output( arg_out ), input( arg_in )
+    , n0( std::min( (size_t)arg_out.dimension_0() , (size_t)arg_in.dimension_0() ) )
+    , n1( std::min( (size_t)arg_out.dimension_1() , (size_t)arg_in.dimension_1() ) )
+    , n2( std::min( (size_t)arg_out.dimension_2() , (size_t)arg_in.dimension_2() ) )
+    , n3( std::min( (size_t)arg_out.dimension_3() , (size_t)arg_in.dimension_3() ) )
+    , n4( std::min( (size_t)arg_out.dimension_4() , (size_t)arg_in.dimension_4() ) )
+    , n5( std::min( (size_t)arg_out.dimension_5() , (size_t)arg_in.dimension_5() ) )
+    , n6( std::min( (size_t)arg_out.dimension_6() , (size_t)arg_in.dimension_6() ) )
+    , n7( std::min( (size_t)arg_out.dimension_7() , (size_t)arg_in.dimension_7() ) )
+    {
+      typedef typename OutputView::execution_space execution_space ;
+      typedef Kokkos::RangePolicy< execution_space > Policy ;
+      (void) Kokkos::Impl::ParallelFor< ViewRemap , Policy >( *this , Policy( 0 , n0 ) );
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_t i0 ) const
+  {
+    for ( size_t i1 = 0 ; i1 < n1 ; ++i1 ) {
+    for ( size_t i2 = 0 ; i2 < n2 ; ++i2 ) {
+    for ( size_t i3 = 0 ; i3 < n3 ; ++i3 ) {
+    for ( size_t i4 = 0 ; i4 < n4 ; ++i4 ) {
+    for ( size_t i5 = 0 ; i5 < n5 ; ++i5 ) {
+    for ( size_t i6 = 0 ; i6 < n6 ; ++i6 ) {
+    for ( size_t i7 = 0 ; i7 < n7 ; ++i7 ) {
+      output(i0,i1,i2,i3,i4,i5,i6,i7) = input(i0,i1,i2,i3,i4,i5,i6,i7);
+    }}}}}}}
+  }
+};
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+/** \brief  Deep copy a value from Host memory into a view.  */
+template< class DT , class D1 , class D2 , class D3 >
+inline
+void deep_copy( const View<DT,D1,D2,D3> & dst
+              , typename ViewTraits<DT,D1,D2,D3>::const_value_type & value )
+{
+  static_assert( std::is_same< typename ViewTraits<DT,D1,D2,D3>::non_const_value_type ,
+                               typename ViewTraits<DT,D1,D2,D3>::value_type >::value
+               , "ERROR: Incompatible deep_copy( View , value )" );
+
+  Kokkos::Experimental::Impl::ViewFill< View<DT,D1,D2,D3> >( dst , value );
+}
+
+/** \brief  Deep copy into a value in Host memory from a view.  */
+template< class ST , class S1 , class S2 , class S3 >
+inline
+void deep_copy( ST & dst , const View<ST,S1,S2,S3> & src )
+{
+  static_assert( ViewTraits<ST,S1,S2,S3>::rank == 0 
+               , "ERROR: Non-rank-zero view in deep_copy( value , View )" );
+
+  typedef ViewTraits<ST,S1,S2,S3>            src_traits ;
+  typedef typename src_traits::memory_space  src_memory_space ;
+  Kokkos::Impl::DeepCopy< HostSpace , src_memory_space >( & dst , src.data() , sizeof(ST) );
+}
+
+//----------------------------------------------------------------------------
+/** \brief  A deep copy between views of compatible type, and rank zero.  */
+template< class DT , class D1 , class D2 , class D3
+        , class ST , class S1 , class S2 , class S3 >
+inline
+void deep_copy( const View<DT,D1,D2,D3> & dst ,
+                const View<ST,S1,S2,S3> & src ,
+                typename std::enable_if<(
+                  // Rank zero:
+                  ( unsigned(ViewTraits<DT,D1,D2,D3>::rank) == unsigned(0) ) &&
+                  ( unsigned(ViewTraits<ST,S1,S2,S3>::rank) == unsigned(0) ) &&
+                  // Same type and destination is not constant:
+                  std::is_same< typename ViewTraits<DT,D1,D2,D3>::value_type ,
+                                typename ViewTraits<ST,S1,S2,S3>::non_const_value_type >::value
+                )>::type * = 0 )
+{
+  typedef View<DT,D1,D2,D3>  dst_type ;
+  typedef View<ST,S1,S2,S3>  src_type ;
+
+  typedef typename dst_type::value_type    value_type ;
+  typedef typename dst_type::memory_space  dst_memory_space ;
+  typedef typename src_type::memory_space  src_memory_space ;
+
+  if ( dst.data() != src.data() ) {
+    Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , sizeof(value_type) );
+  }
+}
+
+//----------------------------------------------------------------------------
+/** \brief  A deep copy between views of the default specialization, compatible type,
+ *          same non-zero rank, same contiguous layout.
+ */
+template< class DT , class D1 , class D2 , class D3 ,
+          class ST , class S1 , class S2 , class S3 >
+inline
+void deep_copy( const View<DT,D1,D2,D3> & dst ,
+                const View<ST,S1,S2,S3> & src ,
+                typename std::enable_if<(
+                  // destination is non-const.
+                  std::is_same< typename ViewTraits<DT,D1,D2,D3>::value_type ,
+                                typename ViewTraits<DT,D1,D2,D3>::non_const_value_type >::value
+                  &&
+                  // Same non-zero rank:
+                  ( unsigned(ViewTraits<DT,D1,D2,D3>::rank) != 0 )
+                  &&
+                  ( unsigned(ViewTraits<DT,D1,D2,D3>::rank) ==
+                    unsigned(ViewTraits<ST,S1,S2,S3>::rank) )
+                  &&
+                  // Not specialized, default ViewMapping
+                  std::is_same< typename ViewTraits<DT,D1,D2,D3>::specialize , void >::value
+                  &&
+                  std::is_same< typename ViewTraits<ST,S1,S2,S3>::specialize , void >::value
+                )>::type * = 0 )
+{
+  typedef View<DT,D1,D2,D3>  dst_type ;
+  typedef View<ST,S1,S2,S3>  src_type ;
+
+  typedef typename dst_type::execution_space  dst_execution_space ;
+  typedef typename dst_type::memory_space     dst_memory_space ;
+  typedef typename src_type::memory_space     src_memory_space ;
+
+  enum { DstExecCanAccessSrc =
+   Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename dst_execution_space::memory_space , src_memory_space >::value };
+
+  if ( (void *) dst.data() != (void*) src.data() ) {
+
+    // Concern: If overlapping views then a parallel copy will be erroneous.
+    // ...
+
+    // If same type, equal layout, equal dimensions, equal span, and contiguous memory then can byte-wise copy
+
+    if ( std::is_same< typename ViewTraits<DT,D1,D2,D3>::value_type ,
+                       typename ViewTraits<ST,S1,S2,S3>::non_const_value_type >::value &&
+         std::is_same< typename ViewTraits<DT,D1,D2,D3>::array_layout ,
+                       typename ViewTraits<ST,S1,S2,S3>::array_layout >::value &&
+         dst.span_is_contiguous() &&
+         src.span_is_contiguous() &&
+         dst.span() == src.span() &&
+         dst.dimension_0() == src.dimension_0() &&
+         dst.dimension_1() == src.dimension_1() &&
+         dst.dimension_2() == src.dimension_2() &&
+         dst.dimension_3() == src.dimension_3() &&
+         dst.dimension_4() == src.dimension_4() &&
+         dst.dimension_5() == src.dimension_5() &&
+         dst.dimension_6() == src.dimension_6() &&
+         dst.dimension_7() == src.dimension_7() ) {
+
+      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.span();
+
+      Kokkos::Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.data() , src.data() , nbytes );
+    }
+    else if ( DstExecCanAccessSrc ) {
+      // Copying data between views in accessible memory spaces and either non-contiguous or incompatible shape.
+      Kokkos::Experimental::Impl::ViewRemap< dst_type , src_type >( dst , src );
+    }
+    else {
+      Kokkos::Impl::throw_runtime_exception("deep_copy given views that would require a temporary allocation");
+    }
+  }
+}
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+template< class T , class A1, class A2, class A3 >
+inline
+typename Kokkos::Experimental::View<T,A1,A2,A3>::HostMirror
+create_mirror( const Kokkos::Experimental::View<T,A1,A2,A3> & src
+             , typename std::enable_if<
+                 ! std::is_same< typename Kokkos::Experimental::ViewTraits<T,A1,A2,A3>::array_layout
+                               , Kokkos::LayoutStride >::value
+               >::type * = 0
+             )
+{
+  typedef View<T,A1,A2,A3>               src_type ;
+  typedef typename src_type::HostMirror  dst_type ;
+
+  return dst_type( std::string( src.label() ).append("_mirror")
+                 , src.dimension_0()
+                 , src.dimension_1()
+                 , src.dimension_2()
+                 , src.dimension_3()
+                 , src.dimension_4()
+                 , src.dimension_5()
+                 , src.dimension_6()
+                 , src.dimension_7() );
+}
+
+template< class T , class A1, class A2, class A3 >
+inline
+typename Kokkos::Experimental::View<T,A1,A2,A3>::HostMirror
+create_mirror( const Kokkos::Experimental::View<T,A1,A2,A3> & src
+             , typename std::enable_if<
+                 std::is_same< typename Kokkos::Experimental::ViewTraits<T,A1,A2,A3>::array_layout
+                             , Kokkos::LayoutStride >::value
+               >::type * = 0
+             )
+{
+  typedef View<T,A1,A2,A3>               src_type ;
+  typedef typename src_type::HostMirror  dst_type ;
+
+  Kokkos::LayoutStride layout ;
+
+  layout.dimension[0] = src.dimension_0();
+  layout.dimension[1] = src.dimension_1();
+  layout.dimension[2] = src.dimension_2();
+  layout.dimension[3] = src.dimension_3();
+  layout.dimension[4] = src.dimension_4();
+  layout.dimension[5] = src.dimension_5();
+  layout.dimension[6] = src.dimension_6();
+  layout.dimension[7] = src.dimension_7();
+
+  layout.stride[0] = src.stride_0();
+  layout.stride[1] = src.stride_1();
+  layout.stride[2] = src.stride_2();
+  layout.stride[3] = src.stride_3();
+  layout.stride[4] = src.stride_4();
+  layout.stride[5] = src.stride_5();
+  layout.stride[6] = src.stride_6();
+  layout.stride[7] = src.stride_7();
+
+  return dst_type( std::string( src.label() ).append("_mirror") , layout );
+}
+
+template< class T , class A1 , class A2 , class A3 >
+inline
+typename Kokkos::Experimental::View<T,A1,A2,A3>::HostMirror
+create_mirror_view( const Kokkos::Experimental::View<T,A1,A2,A3> & src
+                  , typename std::enable_if<(
+                      std::is_same< typename Kokkos::Experimental::ViewTraits<T,A1,A2,A3>::memory_space
+                                  , typename Kokkos::Experimental::ViewTraits<T,A1,A2,A3>::host_mirror_space
+                                  >::value
+                    )>::type * = 0 
+                  )
+{
+  return src ;
+}
+
+template< class T , class A1 , class A2 , class A3 >
+inline
+typename Kokkos::Experimental::View<T,A1,A2,A3>::HostMirror
+create_mirror_view( const Kokkos::Experimental::View<T,A1,A2,A3> & src
+                  , typename std::enable_if<(
+                      ! std::is_same< typename Kokkos::Experimental::ViewTraits<T,A1,A2,A3>::memory_space
+                                    , typename Kokkos::Experimental::ViewTraits<T,A1,A2,A3>::host_mirror_space
+                                    >::value
+                    )>::type * = 0 
+                  )
+{
+  return Kokkos::Experimental::create_mirror( src );
+}
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+/** \brief  Resize a view with copying old data to new data at the corresponding indices. */
+template< class T , class A1 , class A2 , class A3 >
+inline
+void resize( Kokkos::Experimental::View<T,A1,A2,A3> & v ,
+             const size_t n0 = 0 ,
+             const size_t n1 = 0 ,
+             const size_t n2 = 0 ,
+             const size_t n3 = 0 ,
+             const size_t n4 = 0 ,
+             const size_t n5 = 0 ,
+             const size_t n6 = 0 ,
+             const size_t n7 = 0 )
+{
+  typedef Kokkos::Experimental::View<T,A1,A2,A3>  view_type ;
+
+  static_assert( Kokkos::Experimental::ViewTraits<T,A1,A2,A3>::is_managed , "Can only resize managed views" );
+
+  view_type v_resized( v.label(), n0, n1, n2, n3, n4, n5, n6, n7 );
+
+  Kokkos::Experimental::Impl::ViewRemap< view_type , view_type >( v_resized , v );
+
+  v = v_resized ;
+}
+
+/** \brief  Resize a view with copying old data to new data at the corresponding indices. */
+template< class T , class A1 , class A2 , class A3 >
+inline
+void realloc( Kokkos::Experimental::View<T,A1,A2,A3> & v ,
+              const size_t n0 = 0 ,
+              const size_t n1 = 0 ,
+              const size_t n2 = 0 ,
+              const size_t n3 = 0 ,
+              const size_t n4 = 0 ,
+              const size_t n5 = 0 ,
+              const size_t n6 = 0 ,
+              const size_t n7 = 0 )
+{
+  typedef Kokkos::Experimental::View<T,A1,A2,A3>  view_type ;
+
+  static_assert( Kokkos::Experimental::ViewTraits<T,A1,A2,A3>::is_managed , "Can only realloc managed views" );
+
+  const std::string label = v.label();
+
+  v = view_type(); // Deallocate first, if the only view to allocation
+  v = view_type( label, n0, n1, n2, n3, n4, n5, n6, n7 );
+}
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
+
+namespace Kokkos {
+
+template< class D , class A1 = void , class A2 = void , class A3 = void >
+using ViewTraits = Kokkos::Experimental::ViewTraits<D,A1,A2,A3> ;
+
+template< class D , class A1 = void , class A2 = void , class A3 = void , class S = void >
+using View = Kokkos::Experimental::View<D,A1,A2,A3> ;
+
+using Kokkos::Experimental::deep_copy ;
+using Kokkos::Experimental::create_mirror ;
+using Kokkos::Experimental::create_mirror_view ;
+using Kokkos::Experimental::subview ;
+using Kokkos::Experimental::resize ;
+using Kokkos::Experimental::realloc ;
+
+namespace Impl {
+
+using Kokkos::Experimental::is_view ;
+
+class ViewDefault {};
+
+template< class SrcViewType
+        , class Arg0Type
+        , class Arg1Type
+        , class Arg2Type
+        , class Arg3Type
+        , class Arg4Type
+        , class Arg5Type
+        , class Arg6Type
+        , class Arg7Type
+        >
+struct ViewSubview /* { typedef ... type ; } */ ;
+
+}
+
+} /* namespace Kokkos */
+
+#include <impl/Kokkos_Atomic_View.hpp>
+
+#endif /* #if defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif
+
diff --git a/lib/kokkos/core/src/Kokkos_Atomic.hpp b/lib/kokkos/core/src/Kokkos_Atomic.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..60009e6d4dd66aa7d817e294203363797133664d
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Atomic.hpp
@@ -0,0 +1,285 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Atomic.hpp
+/// \brief Atomic functions
+///
+/// This header file defines prototypes for the following atomic functions:
+///   - exchange
+///   - compare and exchange
+///   - add
+///
+/// Supported types include:
+///   - signed and unsigned 4 and 8 byte integers
+///   - float
+///   - double
+///
+/// They are implemented through GCC compatible intrinsics, OpenMP
+/// directives and native CUDA intrinsics.
+///
+/// Including this header file requires one of the following
+/// compilers:
+///   - NVCC (for CUDA device code only)
+///   - GCC (for host code only)
+///   - Intel (for host code only)
+///   - A compiler that supports OpenMP 3.1 (for host code only)
+
+#ifndef KOKKOS_ATOMIC_HPP
+#define KOKKOS_ATOMIC_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_HostSpace.hpp>
+#include <impl/Kokkos_Traits.hpp>
+
+//----------------------------------------------------------------------------
+#if defined(_WIN32)
+#define KOKKOS_ATOMICS_USE_WINDOWS
+#else
+#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
+
+// Compiling NVIDIA device code, must use Cuda atomics:
+
+#define KOKKOS_ATOMICS_USE_CUDA
+
+#elif ! defined( KOKKOS_ATOMICS_USE_GCC ) && \
+      ! defined( KOKKOS_ATOMICS_USE_INTEL ) && \
+      ! defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+// Compiling for non-Cuda atomic implementation has not been pre-selected.
+// Choose the best implementation for the detected compiler.
+// Preference: GCC, INTEL, OMP31
+
+#if defined( KOKKOS_COMPILER_GNU ) || \
+    defined( KOKKOS_COMPILER_CLANG ) || \
+    ( defined ( KOKKOS_COMPILER_NVCC ) && defined ( __GNUC__ ) )
+
+#define KOKKOS_ATOMICS_USE_GCC
+
+#elif defined( KOKKOS_COMPILER_INTEL ) || \
+      defined( KOKKOS_COMPILER_CRAYC )
+
+#define KOKKOS_ATOMICS_USE_INTEL
+
+#elif defined( _OPENMP ) && ( 201107 <= _OPENMP )
+
+#define KOKKOS_ATOMICS_USE_OMP31
+
+#else
+
+#error "KOKKOS_ATOMICS_USE : Unsupported compiler"
+
+#endif
+
+#endif /* Not pre-selected atomic implementation */
+#endif
+
+//----------------------------------------------------------------------------
+
+// Forward decalaration of functions supporting arbitrary sized atomics
+// This is necessary since Kokkos_Atomic.hpp is internally included very early
+// through Kokkos_HostSpace.hpp as well as the allocation tracker.
+#ifdef KOKKOS_HAVE_CUDA
+namespace Kokkos {
+namespace Impl {
+/// \brief Aquire a lock for the address
+///
+/// This function tries to aquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully aquired the
+/// function returns true. Otherwise it returns false.
+__device__ inline
+bool lock_address_cuda_space(void* ptr);
+
+/// \brief Release lock for the address
+///
+/// This function releases the lock for the hash value derived
+/// from the provided ptr. This function should only be called
+/// after previously successfully aquiring a lock with
+/// lock_address.
+__device__ inline
+void unlock_address_cuda_space(void* ptr);
+}
+}
+#endif
+
+
+namespace Kokkos {
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_add(volatile T * const dest, const T src);
+
+// Atomic increment
+template<typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment(volatile T* a);
+
+template<typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement(volatile T* a);
+}
+
+#if ! defined(_WIN32)
+#include<impl/Kokkos_Atomic_Assembly_X86.hpp>
+#endif
+
+namespace Kokkos {
+
+
+inline
+const char * atomic_query_version()
+{
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+  return "KOKKOS_ATOMICS_USE_CUDA" ;
+#elif defined( KOKKOS_ATOMICS_USE_GCC )
+  return "KOKKOS_ATOMICS_USE_GCC" ;
+#elif defined( KOKKOS_ATOMICS_USE_INTEL )
+  return "KOKKOS_ATOMICS_USE_INTEL" ;
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+  return "KOKKOS_ATOMICS_USE_OMP31" ;
+#elif defined( KOKKOS_ATOMICS_USE_WINDOWS )
+  return "KOKKOS_ATOMICS_USE_WINDOWS";
+#endif
+}
+
+} // namespace Kokkos
+
+#ifdef _WIN32
+#include "impl/Kokkos_Atomic_Windows.hpp"
+#else
+//#include "impl/Kokkos_Atomic_Assembly_X86.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic exchange
+//
+// template< typename T >
+// T atomic_exchange( volatile T* const dest , const T val )
+// { T tmp = *dest ; *dest = val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Exchange.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic compare-and-exchange
+//
+// template<class T>
+// bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
+// { bool equal = compare == *dest ; if ( equal ) { *dest = val ; } return equal ; }
+
+#include "impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and add
+//
+// template<class T>
+// T atomic_fetch_add(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest += val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_Add.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and sub
+//
+// template<class T>
+// T atomic_fetch_sub(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest -= val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_Sub.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and or
+//
+// template<class T>
+// T atomic_fetch_or(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest = tmp | val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_Or.hpp"
+
+//----------------------------------------------------------------------------
+// Atomic fetch and and
+//
+// template<class T>
+// T atomic_fetch_and(volatile T* const dest, const T val)
+// { T tmp = *dest ; *dest = tmp & val ; return tmp ; }
+
+#include "impl/Kokkos_Atomic_Fetch_And.hpp"
+#endif /*Not _WIN32*/
+
+//----------------------------------------------------------------------------
+// Memory fence
+//
+// All loads and stores from this thread will be globally consistent before continuing
+//
+// void memory_fence() {...};
+#include "impl/Kokkos_Memory_Fence.hpp"
+
+//----------------------------------------------------------------------------
+// Provide volatile_load and safe_load
+//
+// T volatile_load(T const volatile * const ptr);
+//
+// T const& safe_load(T const * const ptr);
+// XEON PHI
+// T safe_load(T const * const ptr
+
+#include "impl/Kokkos_Volatile_Load.hpp"
+
+#ifndef _WIN32
+#include "impl/Kokkos_Atomic_Generic.hpp"
+#endif
+//----------------------------------------------------------------------------
+// This atomic-style macro should be an inlined function, not a macro
+
+#if defined( KOKKOS_COMPILER_GNU ) && !defined(__PGIC__)
+
+  #define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) __builtin_prefetch(addr,0,0)
+  #define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) __builtin_prefetch(addr,1,0)
+
+#else
+
+  #define KOKKOS_NONTEMPORAL_PREFETCH_LOAD(addr) ((void)0)
+  #define KOKKOS_NONTEMPORAL_PREFETCH_STORE(addr) ((void)0)
+
+#endif
+
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_ATOMIC_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..c521e23159884744c21adb368f43247944c95e91
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Core.hpp
@@ -0,0 +1,228 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CORE_HPP
+#define KOKKOS_CORE_HPP
+
+//----------------------------------------------------------------------------
+// Include the execution space header files for the enabled execution spaces.
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_HAVE_CUDA )
+#include <Kokkos_Cuda.hpp>
+#endif
+
+#if defined( KOKKOS_HAVE_OPENMP )
+#include <Kokkos_OpenMP.hpp>
+#endif
+
+#if defined( KOKKOS_HAVE_SERIAL )
+#include <Kokkos_Serial.hpp>
+#endif
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+#include <Kokkos_Threads.hpp>
+#endif
+
+#include <Kokkos_Pair.hpp>
+#include <Kokkos_View.hpp>
+#include <Kokkos_Vectorization.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <Kokkos_hwloc.hpp>
+
+#include <iostream>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+struct InitArguments {
+  int num_threads;
+  int num_numa;
+  int device_id;
+
+  InitArguments() {
+    num_threads = -1;
+    num_numa = -1;
+    device_id = -1;
+  }
+};
+
+void initialize(int& narg, char* arg[]);
+
+void initialize(const InitArguments& args = InitArguments());
+
+/** \brief  Finalize the spaces that were initialized via Kokkos::initialize */
+void finalize();
+
+/** \brief  Finalize all known execution spaces */
+void finalize_all();
+
+void fence();
+
+}
+
+#ifdef KOKKOS_HAVE_CXX11
+namespace Kokkos {
+
+namespace Impl {
+// should only by used by kokkos_malloc and kokkos_free
+struct MallocHelper
+{
+  static void increment_ref_count( AllocationTracker const & tracker )
+  {
+    tracker.increment_ref_count();
+  }
+
+  static void decrement_ref_count( AllocationTracker const & tracker )
+  {
+    tracker.decrement_ref_count();
+  }
+};
+} // namespace Impl
+
+/* Allocate memory from a memory space.
+ * The allocation is tracked in Kokkos memory tracking system, so
+ * leaked memory can be identified.
+ */
+template< class Arg = DefaultExecutionSpace>
+void* kokkos_malloc(const std::string label, size_t count) {
+  typedef typename Arg::memory_space MemorySpace;
+  Impl::AllocationTracker tracker = MemorySpace::allocate_and_track(label,count);;
+  Impl::MallocHelper::increment_ref_count( tracker );
+  return tracker.alloc_ptr();
+}
+
+template< class Arg = DefaultExecutionSpace>
+void* kokkos_malloc(const size_t& count) {
+  return kokkos_malloc<Arg>("DefaultLabel",count);
+}
+
+
+/* Free memory from a memory space.
+ */
+template< class Arg = DefaultExecutionSpace>
+void kokkos_free(const void* ptr) {
+  typedef typename Arg::memory_space MemorySpace;
+  typedef typename MemorySpace::allocator allocator;
+  Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(ptr);
+  if (tracker.is_valid()) {
+    Impl::MallocHelper::decrement_ref_count( tracker );
+  }
+}
+
+
+template< class Arg = DefaultExecutionSpace>
+const void* kokkos_realloc(const void* old_ptr, size_t size) {
+  typedef typename Arg::memory_space MemorySpace;
+  typedef typename MemorySpace::allocator allocator;
+  Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(old_ptr);
+
+  tracker.reallocate(size);
+
+  return tracker.alloc_ptr();
+}
+
+} // namespace Kokkos
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
+inline
+void * kokkos_malloc( const size_t arg_alloc_size )
+{
+  typedef typename Space::memory_space  MemorySpace ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void >         RecordBase ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void >  RecordHost ;
+
+  RecordHost * const r = RecordHost::allocate( MemorySpace() , "kokkos_malloc" , arg_alloc_size );
+
+  RecordBase::increment( r );
+
+  return r->data();
+}
+
+template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
+inline
+void kokkos_free( void * arg_alloc )
+{
+  typedef typename Space::memory_space  MemorySpace ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void >         RecordBase ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void >  RecordHost ;
+
+  RecordHost * const r = RecordHost::get_record( arg_alloc );
+
+  RecordBase::decrement( r );
+}
+
+template< class Space = typename Kokkos::DefaultExecutionSpace::memory_space >
+inline
+void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
+{
+  typedef typename Space::memory_space  MemorySpace ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void >         RecordBase ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void >  RecordHost ;
+
+  RecordHost * const r_old = RecordHost::get_record( arg_alloc );
+  RecordHost * const r_new = RecordHost::allocate( MemorySpace() , "kokkos_malloc" , arg_alloc_size );
+
+  Kokkos::Impl::DeepCopy<MemorySpace,MemorySpace>( r_new->data() , r_old->data()
+                                                 , std::min( r_old->size() , r_new->size() ) );
+
+  RecordBase::increment( r_new );
+  RecordBase::decrement( r_old );
+
+  return r_new->data();
+}
+
+} // namespace Experimental
+} // namespace Kokkos
+
+#endif
+
diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..2cde9299a4d070abb41712a3a352f20bb1b81530
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp
@@ -0,0 +1,170 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CORE_FWD_HPP
+#define KOKKOS_CORE_FWD_HPP
+
+//----------------------------------------------------------------------------
+// Kokkos_Macros.hpp does introspection on configuration options
+// and compiler environment then sets a collection of #define macros.
+
+#include <Kokkos_Macros.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Forward declarations for class inter-relationships
+
+namespace Kokkos {
+
+class HostSpace ; ///< Memory space for main process and CPU execution spaces
+
+#if defined( KOKKOS_HAVE_SERIAL )
+class Serial ;    ///< Execution space main process on CPU
+#endif // defined( KOKKOS_HAVE_SERIAL )
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+class Threads ;  ///< Execution space with pthreads back-end
+#endif
+
+#if defined( KOKKOS_HAVE_OPENMP )
+class OpenMP ; ///< OpenMP execution space
+#endif
+
+#if defined( KOKKOS_HAVE_CUDA )
+class CudaSpace ;            ///< Memory space on Cuda GPU
+class CudaUVMSpace ;         ///< Memory space on Cuda GPU with UVM
+class CudaHostPinnedSpace ;  ///< Memory space on Host accessible to Cuda GPU
+class Cuda ;                 ///< Execution space for Cuda GPU
+#endif
+
+template<class ExecutionSpace, class MemorySpace>
+struct Device;
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Set the default execution space.
+
+/// Define Kokkos::DefaultExecutionSpace as per configuration option
+/// or chosen from the enabled execution spaces in the following order:
+/// Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Serial
+
+namespace Kokkos {
+
+#if   defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA )
+  typedef Cuda DefaultExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
+  typedef OpenMP DefaultExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
+  typedef Threads DefaultExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
+  typedef Serial DefaultExecutionSpace ;
+#else
+#  error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
+#endif
+
+#if defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
+  typedef OpenMP DefaultHostExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
+  typedef Threads DefaultHostExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
+  typedef Serial DefaultHostExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_OPENMP )
+  typedef OpenMP DefaultHostExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_PTHREAD )
+  typedef Threads DefaultHostExecutionSpace ;
+#elif defined ( KOKKOS_HAVE_SERIAL )
+  typedef Serial DefaultHostExecutionSpace ;
+#else
+#  error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
+#endif
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Detect the active execution space and define its memory space.
+// This is used to verify whether a running kernel can access
+// a given memory space.
+
+namespace Kokkos {
+namespace Impl {
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined (KOKKOS_HAVE_CUDA)
+typedef Kokkos::CudaSpace  ActiveExecutionMemorySpace ;
+#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+typedef Kokkos::HostSpace  ActiveExecutionMemorySpace ;
+#else
+typedef void ActiveExecutionMemorySpace ;
+#endif
+
+template< class ActiveSpace , class MemorySpace >
+struct VerifyExecutionCanAccessMemorySpace {
+  enum {value = 0};
+};
+
+template< class Space >
+struct VerifyExecutionCanAccessMemorySpace< Space , Space >
+{
+  enum {value = 1};
+  KOKKOS_INLINE_FUNCTION static void verify(void) {}
+  KOKKOS_INLINE_FUNCTION static void verify(const void *) {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE , DATA_PTR ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
+    Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify( DATA_PTR )
+
+#define KOKKOS_RESTRICT_EXECUTION_TO_( DATA_SPACE ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
+    Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify()
+
+namespace Kokkos {
+  void fence();
+}
+
+#endif /* #ifndef KOKKOS_CORE_FWD_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Kokkos_Cuda.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..d736459b54ffffddf3b1a5f087cf8e55cb97b410
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp
@@ -0,0 +1,268 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDA_HPP
+#define KOKKOS_CUDA_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+
+// If CUDA execution space is enabled then use this header file.
+
+#if defined( KOKKOS_HAVE_CUDA )
+
+#include <iosfwd>
+#include <vector>
+
+#include <Kokkos_CudaSpace.hpp>
+
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+class CudaExec ;
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/// \class Cuda
+/// \brief Kokkos Execution Space that uses CUDA to run on GPUs.
+///
+/// An "execution space" represents a parallel execution model.  It tells Kokkos
+/// how to parallelize the execution of kernels in a parallel_for or
+/// parallel_reduce.  For example, the Threads execution space uses Pthreads or
+/// C++11 threads on a CPU, the OpenMP execution space uses the OpenMP language
+/// extensions, and the Serial execution space executes "parallel" kernels
+/// sequentially.  The Cuda execution space uses NVIDIA's CUDA programming
+/// model to execute kernels in parallel on GPUs.
+class Cuda {
+public:
+  //! \name Type declarations that all Kokkos execution spaces must provide.
+  //@{
+
+  //! Tag this class as a kokkos execution space
+  typedef Cuda                  execution_space ;
+
+#if defined( KOKKOS_USE_CUDA_UVM )
+  //! This execution space's preferred memory space.
+  typedef CudaUVMSpace          memory_space ;
+#else
+  //! This execution space's preferred memory space.
+  typedef CudaSpace             memory_space ;
+#endif
+
+  //! This execution space preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  //! The size_type best suited for this execution space.
+  typedef memory_space::size_type  size_type ;
+
+  //! This execution space's preferred array layout.
+  typedef LayoutLeft            array_layout ;
+
+  //! 
+  typedef ScratchMemorySpace< Cuda >  scratch_memory_space ;
+
+  //@}
+  //--------------------------------------------------
+  //! \name Functions that all Kokkos devices must implement.
+  //@{
+
+  /// \brief True if and only if this method is being called in a
+  ///   thread-parallel function.
+  KOKKOS_INLINE_FUNCTION static int in_parallel() {
+#if defined( __CUDA_ARCH__ )
+    return true;
+#else
+    return false;
+#endif
+  }
+
+  /** \brief  Set the device in a "sleep" state.
+   *
+   * This function sets the device in a "sleep" state in which it is
+   * not ready for work.  This may consume less resources than if the
+   * device were in an "awake" state, but it may also take time to
+   * bring the device from a sleep state to be ready for work.
+   *
+   * \return True if the device is in the "sleep" state, else false if
+   *   the device is actively working and could not enter the "sleep"
+   *   state.
+   */
+  static bool sleep();
+
+  /// \brief Wake the device from the 'sleep' state so it is ready for work.
+  ///
+  /// \return True if the device is in the "ready" state, else "false"
+  ///  if the device is actively working (which also means that it's
+  ///  awake).
+  static bool wake();
+
+  /// \brief Wait until all dispatched functors complete.
+  ///
+  /// The parallel_for or parallel_reduce dispatch of a functor may
+  /// return asynchronously, before the functor completes.  This
+  /// method does not return until all dispatched functors on this
+  /// device have completed.
+  static void fence();
+
+  //! Free any resources being consumed by the device.
+  static void finalize();
+
+  //! Has been initialized
+  static int is_initialized();
+
+  //! Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  //@}
+  //--------------------------------------------------
+  //! \name  Cuda space instances
+
+  ~Cuda() {}
+  Cuda();
+  explicit Cuda( const int instance_id );
+
+  Cuda( const Cuda & ) = default ;
+  Cuda( Cuda && ) = default ;
+  Cuda & operator = ( const Cuda & ) = default ;
+  Cuda & operator = ( Cuda && ) = default ;
+
+  //--------------------------------------------------------------------------
+  //! \name Device-specific functions
+  //@{
+
+  struct SelectDevice {
+    int cuda_device_id ;
+    SelectDevice() : cuda_device_id(0) {}
+    explicit SelectDevice( int id ) : cuda_device_id( id ) {}
+  };
+
+  //! Initialize, telling the CUDA run-time library which device to use.
+  static void initialize( const SelectDevice = SelectDevice()
+                        , const size_t num_instances = 1 );
+
+  /// \brief Cuda device architecture of the selected device.
+  ///
+  /// This matches the __CUDA_ARCH__ specification.
+  static size_type device_arch();
+
+  //! Query device count.
+  static size_type detect_device_count();
+
+  /** \brief  Detect the available devices and their architecture
+   *          as defined by the __CUDA_ARCH__ specification.
+   */
+  static std::vector<unsigned> detect_device_arch();
+
+  cudaStream_t cuda_stream() const { return m_stream ; }
+  int          cuda_device() const { return m_device ; }
+
+  //@}
+  //--------------------------------------------------------------------------
+
+private:
+
+  cudaStream_t m_stream ;
+  int          m_device ;
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::CudaSpace
+  , Kokkos::Cuda::scratch_memory_space
+  >
+{
+  enum { value = true };
+  KOKKOS_INLINE_FUNCTION static void verify( void ) { }
+  KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
+};
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::HostSpace
+  , Kokkos::Cuda::scratch_memory_space
+  >
+{
+  enum { value = false };
+  inline static void verify( void ) { CudaSpace::access_error(); }
+  inline static void verify( const void * p ) { CudaSpace::access_error(p); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#include <Cuda/Kokkos_CudaExec.hpp>
+#include <Cuda/Kokkos_Cuda_View.hpp>
+
+#include <KokkosExp_View.hpp>
+#include <Cuda/KokkosExp_Cuda_View.hpp>
+
+#include <Cuda/Kokkos_Cuda_Parallel.hpp>
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
+#endif /* #ifndef KOKKOS_CUDA_HPP */
+
+
+
diff --git a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..34915fd382a4bfb4e9b282d678624f93edff03d4
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp
@@ -0,0 +1,656 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_CUDASPACE_HPP
+#define KOKKOS_CUDASPACE_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_HAVE_CUDA )
+
+#include <iosfwd>
+#include <typeinfo>
+#include <string>
+
+#include <Kokkos_HostSpace.hpp>
+
+#include <impl/Kokkos_AllocationTracker.hpp>
+
+#include <Cuda/Kokkos_Cuda_abort.hpp>
+#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Cuda on-device memory management */
+
+class CudaSpace {
+public:
+
+  //! Tag this class as a kokkos memory space
+  typedef CudaSpace             memory_space ;
+  typedef Kokkos::Cuda          execution_space ;
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  typedef unsigned int          size_type ;
+
+  typedef Impl::CudaMallocAllocator allocator;
+
+  /** \brief  Allocate a contiguous block of memory.
+   *
+   *  The input label is associated with the block of memory.
+   *  The block of memory is tracked via reference counting where
+   *  allocation gives it a reference count of one.
+   */
+  static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
+
+  /*--------------------------------*/
+  /** \brief  Cuda specific function to attached texture object to an allocation.
+   *          Output the texture object, base pointer, and offset from the input pointer.
+   */
+#if defined( __CUDACC__ )
+  static void texture_object_attach(  Impl::AllocationTracker const & tracker
+                                    , unsigned type_size
+                                    , ::cudaChannelFormatDesc const & desc
+                                   );
+#endif
+
+  /*--------------------------------*/
+
+  CudaSpace();
+  CudaSpace( const CudaSpace & rhs ) = default ;
+  CudaSpace & operator = ( const CudaSpace & rhs ) = default ;
+  ~CudaSpace() = default ;
+
+  /**\brief  Allocate memory in the cuda space */
+  void * allocate( const size_t arg_alloc_size ) const ;
+
+  /**\brief  Deallocate memory in the cuda space */
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const ;
+
+  /*--------------------------------*/
+  /** \brief  Error reporting for HostSpace attempt to access CudaSpace */
+  static void access_error();
+  static void access_error( const void * const );
+
+private:
+
+  int  m_device ; ///< Which Cuda device
+
+  // friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > ;
+};
+
+namespace Impl {
+/// \brief Initialize lock array for arbitrary size atomics.
+///
+/// Arbitrary atomics are implemented using a hash table of locks
+/// where the hash value is derived from the address of the
+/// object for which an atomic operation is performed.
+/// This function initializes the locks to zero (unset).
+void init_lock_array_cuda_space();
+
+/// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
+///
+/// Arbitrary atomics are implemented using a hash table of locks
+/// where the hash value is derived from the address of the
+/// object for which an atomic operation is performed.
+/// This function retrieves the lock array pointer.
+/// If the array is not yet allocated it will do so.
+int* lock_array_cuda_space_ptr(bool deallocate = false);
+}
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Cuda memory that is accessible to Host execution space
+ *          through Cuda's unified virtual memory (UVM) runtime.
+ */
+class CudaUVMSpace {
+public:
+
+  //! Tag this class as a kokkos memory space
+  typedef CudaUVMSpace          memory_space ;
+  typedef Cuda                  execution_space ;
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+  typedef unsigned int          size_type ;
+
+  /** \brief  If UVM capability is available */
+  static bool available();
+
+  typedef Impl::CudaUVMAllocator allocator;
+
+  /** \brief  Allocate a contiguous block of memory.
+   *
+   *  The input label is associated with the block of memory.
+   *  The block of memory is tracked via reference counting where
+   *  allocation gives it a reference count of one.
+   */
+  static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
+
+
+  /** \brief  Cuda specific function to attached texture object to an allocation.
+   *          Output the texture object, base pointer, and offset from the input pointer.
+   */
+#if defined( __CUDACC__ )
+  static void texture_object_attach(  Impl::AllocationTracker const & tracker
+                                    , unsigned type_size
+                                    , ::cudaChannelFormatDesc const & desc
+                                   );
+#endif
+  /*--------------------------------*/
+
+  CudaUVMSpace();
+  CudaUVMSpace( const CudaUVMSpace & rhs ) = default ;
+  CudaUVMSpace & operator = ( const CudaUVMSpace & rhs ) = default ;
+  ~CudaUVMSpace() = default ;
+
+  /**\brief  Allocate memory in the cuda space */
+  void * allocate( const size_t arg_alloc_size ) const ;
+
+  /**\brief  Deallocate memory in the cuda space */
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const ;
+
+  /*--------------------------------*/
+
+private:
+
+  int  m_device ; ///< Which Cuda device
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Host memory that is accessible to Cuda execution space
+ *          through Cuda's host-pinned memory allocation.
+ */
+class CudaHostPinnedSpace {
+public:
+
+  //! Tag this class as a kokkos memory space
+  /** \brief  Memory is in HostSpace so use the HostSpace::execution_space */
+  typedef HostSpace::execution_space  execution_space ;
+  typedef CudaHostPinnedSpace         memory_space ;
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+  typedef unsigned int                size_type ;
+
+
+  typedef Impl::CudaHostAllocator allocator ;
+
+  /** \brief  Allocate a contiguous block of memory.
+   *
+   *  The input label is associated with the block of memory.
+   *  The block of memory is tracked via reference counting where
+   *  allocation gives it a reference count of one.
+   */
+  static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
+
+  /*--------------------------------*/
+
+  CudaHostPinnedSpace();
+  CudaHostPinnedSpace( const CudaHostPinnedSpace & rhs ) = default ;
+  CudaHostPinnedSpace & operator = ( const CudaHostPinnedSpace & rhs ) = default ;
+  ~CudaHostPinnedSpace() = default ;
+
+  /**\brief  Allocate memory in the cuda space */
+  void * allocate( const size_t arg_alloc_size ) const ;
+
+  /**\brief  Deallocate memory in the cuda space */
+  void deallocate( void * const arg_alloc_ptr
+                 , const size_t arg_alloc_size ) const ;
+
+  /*--------------------------------*/
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+template<> struct DeepCopy< CudaSpace , CudaSpace >
+{
+  DeepCopy( void * dst , const void * src , size_t );
+  DeepCopy( const Cuda & , void * dst , const void * src , size_t );
+};
+
+template<> struct DeepCopy< CudaSpace , HostSpace >
+{
+  DeepCopy( void * dst , const void * src , size_t );
+  DeepCopy( const Cuda & , void * dst , const void * src , size_t );
+};
+
+template<> struct DeepCopy< HostSpace , CudaSpace >
+{
+  DeepCopy( void * dst , const void * src , size_t );
+  DeepCopy( const Cuda & , void * dst , const void * src , size_t );
+};
+
+template<> struct DeepCopy< CudaSpace , CudaUVMSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , CudaSpace >( dst , src , n ); }
+};
+
+template<> struct DeepCopy< CudaSpace , CudaHostPinnedSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , HostSpace >( dst , src , n ); }
+};
+
+
+template<> struct DeepCopy< CudaUVMSpace , CudaSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , CudaSpace >( dst , src , n ); }
+};
+
+template<> struct DeepCopy< CudaUVMSpace , CudaUVMSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , CudaSpace >( dst , src , n ); }
+};
+
+template<> struct DeepCopy< CudaUVMSpace , CudaHostPinnedSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , HostSpace >( dst , src , n ); }
+};
+
+template<> struct DeepCopy< CudaUVMSpace , HostSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< CudaSpace , HostSpace >( dst , src , n ); }
+};
+
+
+template<> struct DeepCopy< CudaHostPinnedSpace , CudaSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , CudaSpace >( dst , src , n ); }
+};
+
+template<> struct DeepCopy< CudaHostPinnedSpace , CudaUVMSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , CudaSpace >( dst , src , n ); }
+};
+
+template<> struct DeepCopy< CudaHostPinnedSpace , CudaHostPinnedSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , HostSpace >( dst , src , n ); }
+};
+
+template<> struct DeepCopy< CudaHostPinnedSpace , HostSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , HostSpace >( dst , src , n ); }
+};
+
+
+template<> struct DeepCopy< HostSpace , CudaUVMSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , CudaSpace >( dst , src , n ); }
+};
+
+template<> struct DeepCopy< HostSpace , CudaHostPinnedSpace >
+{
+  inline
+  DeepCopy( void * dst , const void * src , size_t n )
+  { (void) DeepCopy< HostSpace , HostSpace >( dst , src , n ); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** Running in CudaSpace attempting to access HostSpace: error */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::HostSpace >
+{
+  enum { value = false };
+  KOKKOS_INLINE_FUNCTION static void verify( void )
+    { Kokkos::abort("Cuda code attempted to access HostSpace memory"); }
+
+  KOKKOS_INLINE_FUNCTION static void verify( const void * )
+    { Kokkos::abort("Cuda code attempted to access HostSpace memory"); }
+};
+
+/** Running in CudaSpace accessing CudaUVMSpace: ok */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::CudaUVMSpace >
+{
+  enum { value = true };
+  KOKKOS_INLINE_FUNCTION static void verify( void ) { }
+  KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
+};
+
+/** Running in CudaSpace accessing CudaHostPinnedSpace: ok */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::CudaSpace , Kokkos::CudaHostPinnedSpace >
+{
+  enum { value = true };
+  KOKKOS_INLINE_FUNCTION static void verify( void ) { }
+  KOKKOS_INLINE_FUNCTION static void verify( const void * ) { }
+};
+
+/** Running in CudaSpace attempting to access an unknown space: error */
+template< class OtherSpace >
+struct VerifyExecutionCanAccessMemorySpace<
+  typename enable_if< ! is_same<Kokkos::CudaSpace,OtherSpace>::value , Kokkos::CudaSpace >::type ,
+  OtherSpace >
+{
+  enum { value = false };
+  KOKKOS_INLINE_FUNCTION static void verify( void )
+    { Kokkos::abort("Cuda code attempted to access unknown Space memory"); }
+
+  KOKKOS_INLINE_FUNCTION static void verify( const void * )
+    { Kokkos::abort("Cuda code attempted to access unknown Space memory"); }
+};
+
+//----------------------------------------------------------------------------
+/** Running in HostSpace attempting to access CudaSpace */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaSpace >
+{
+  enum { value = false };
+  inline static void verify( void ) { CudaSpace::access_error(); }
+  inline static void verify( const void * p ) { CudaSpace::access_error(p); }
+};
+
+/** Running in HostSpace accessing CudaUVMSpace is OK */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaUVMSpace >
+{
+  enum { value = true };
+  inline static void verify( void ) { }
+  inline static void verify( const void * ) { }
+};
+
+/** Running in HostSpace accessing CudaHostPinnedSpace is OK */
+template<>
+struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::CudaHostPinnedSpace >
+{
+  enum { value = true };
+  KOKKOS_INLINE_FUNCTION static void verify( void ) {}
+  KOKKOS_INLINE_FUNCTION static void verify( const void * ) {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template<>
+class SharedAllocationRecord< Kokkos::CudaSpace , void >
+  : public SharedAllocationRecord< void , void >
+{
+private:
+
+  friend class SharedAllocationRecord< Kokkos::CudaUVMSpace , void > ;
+
+  typedef SharedAllocationRecord< void , void >  RecordBase ;
+
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  static void deallocate( RecordBase * );
+
+  static ::cudaTextureObject_t
+  attach_texture_object( const unsigned sizeof_alias
+                       , void * const   alloc_ptr
+                       , const size_t   alloc_size ); 
+
+  static RecordBase s_root_record ;
+
+  ::cudaTextureObject_t   m_tex_obj ;
+  const Kokkos::CudaSpace m_space ;
+
+protected:
+
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {}
+
+  SharedAllocationRecord( const Kokkos::CudaSpace        & arg_space
+                        , const std::string              & arg_label
+                        , const size_t                     arg_alloc_size
+                        , const RecordBase::function_type  arg_dealloc = & deallocate
+                        );
+
+public:
+
+  std::string get_label() const ;
+
+  static SharedAllocationRecord * allocate( const Kokkos::CudaSpace &  arg_space
+                                          , const std::string       &  arg_label
+                                          , const size_t               arg_alloc_size
+                                          );
+
+  template< typename AliasType >
+  inline
+  ::cudaTextureObject_t attach_texture_object()
+    {
+      static_assert( ( std::is_same< AliasType , int >::value ||
+                       std::is_same< AliasType , ::int2 >::value ||
+                       std::is_same< AliasType , ::int4 >::value )
+                   , "Cuda texture fetch only supported for alias types of int, ::int2, or ::int4" );
+
+      if ( m_tex_obj == 0 ) {
+        m_tex_obj = attach_texture_object( sizeof(AliasType)
+                                         , (void*) RecordBase::m_alloc_ptr
+                                         , RecordBase::m_alloc_size );
+      }
+
+      return m_tex_obj ;
+    }
+
+  template< typename AliasType >
+  inline
+  int attach_texture_object_offset( const AliasType * const ptr )
+    {
+      // Texture object is attached to the entire allocation range
+      return ptr - reinterpret_cast<AliasType*>( RecordBase::m_alloc_ptr );
+    }
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+  static void print_records( std::ostream & , const Kokkos::CudaSpace & , bool detail = false );
+};
+
+
+template<>
+class SharedAllocationRecord< Kokkos::CudaUVMSpace , void >
+  : public SharedAllocationRecord< void , void >
+{
+private:
+
+  typedef SharedAllocationRecord< void , void >  RecordBase ;
+
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  static void deallocate( RecordBase * );
+
+  static RecordBase s_root_record ;
+
+  ::cudaTextureObject_t      m_tex_obj ;
+  const Kokkos::CudaUVMSpace m_space ;
+
+protected:
+
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {}
+
+  SharedAllocationRecord( const Kokkos::CudaUVMSpace     & arg_space
+                        , const std::string              & arg_label
+                        , const size_t                     arg_alloc_size
+                        , const RecordBase::function_type  arg_dealloc = & deallocate
+                        );
+
+public:
+
+  std::string get_label() const ;
+
+  static SharedAllocationRecord * allocate( const Kokkos::CudaUVMSpace &  arg_space
+                                          , const std::string          &  arg_label
+                                          , const size_t                  arg_alloc_size
+                                          );
+
+  template< typename AliasType >
+  inline
+  ::cudaTextureObject_t attach_texture_object()
+    {
+      static_assert( ( std::is_same< AliasType , int >::value ||
+                       std::is_same< AliasType , ::int2 >::value ||
+                       std::is_same< AliasType , ::int4 >::value )
+                   , "Cuda texture fetch only supported for alias types of int, ::int2, or ::int4" );
+
+      if ( m_tex_obj == 0 ) {
+        m_tex_obj = SharedAllocationRecord< Kokkos::CudaSpace , void >::
+          attach_texture_object( sizeof(AliasType)
+                               , (void*) RecordBase::m_alloc_ptr
+                               , RecordBase::m_alloc_size );
+      }
+
+      return m_tex_obj ;
+    }
+
+  template< typename AliasType >
+  inline
+  int attach_texture_object_offset( const AliasType * const ptr )
+    {
+      // Texture object is attached to the entire allocation range
+      return ptr - reinterpret_cast<AliasType*>( RecordBase::m_alloc_ptr );
+    }
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+  static void print_records( std::ostream & , const Kokkos::CudaUVMSpace & , bool detail = false );
+};
+
+template<>
+class SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >
+  : public SharedAllocationRecord< void , void >
+{
+private:
+
+  typedef SharedAllocationRecord< void , void >  RecordBase ;
+
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  static void deallocate( RecordBase * );
+
+  static RecordBase s_root_record ;
+
+  const Kokkos::CudaHostPinnedSpace m_space ;
+
+protected:
+
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() : RecordBase(), m_space() {}
+
+  SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace     & arg_space
+                        , const std::string              & arg_label
+                        , const size_t                     arg_alloc_size
+                        , const RecordBase::function_type  arg_dealloc = & deallocate
+                        );
+
+public:
+
+  std::string get_label() const ;
+
+  static SharedAllocationRecord * allocate( const Kokkos::CudaHostPinnedSpace &  arg_space
+                                          , const std::string          &  arg_label
+                                          , const size_t                  arg_alloc_size
+                                          );
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+  static void print_records( std::ostream & , const Kokkos::CudaHostPinnedSpace & , bool detail = false );
+};
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) */
+#endif /* #define KOKKOS_CUDASPACE_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..807cb5cb435d5be51456492a7f8b0559d55d3382
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp
@@ -0,0 +1,497 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXECPOLICY_HPP
+#define KOKKOS_EXECPOLICY_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_StaticAssert.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Execution policy for work over a range of an integral type.
+ *
+ * Valid template argument options:
+ *
+ *  With a specified execution space:
+ *    < ExecSpace , WorkTag , { IntConst | IntType } >
+ *    < ExecSpace , WorkTag , void >
+ *    < ExecSpace , { IntConst | IntType } , void >
+ *    < ExecSpace , void , void >
+ *
+ *  With the default execution space:
+ *    < WorkTag , { IntConst | IntType } , void >
+ *    < WorkTag , void , void >
+ *    < { IntConst | IntType } , void , void >
+ *    < void , void , void >
+ *
+ *  IntType  is a fundamental integral type
+ *  IntConst is an Impl::integral_constant< IntType , Blocking >
+ *
+ *  Blocking is the granularity of partitioning the range among threads.
+ */
+template< class Arg0 = void , class Arg1 = void , class Arg2 = void 
+        , class ExecSpace =
+          // The first argument is the execution space,
+          // otherwise use the default execution space.
+          typename Impl::if_c< Impl::is_execution_space< Arg0 >::value , Arg0
+                             , Kokkos::DefaultExecutionSpace >::type
+        >
+class RangePolicy {
+private:
+
+  // Default integral type and blocking factor:
+  typedef int DefaultIntType ;
+  enum { DefaultIntValue = 8 };
+
+  enum { Arg0_Void = Impl::is_same< Arg0 , void >::value };
+  enum { Arg1_Void = Impl::is_same< Arg1 , void >::value };
+  enum { Arg2_Void = Impl::is_same< Arg2 , void >::value };
+
+  enum { Arg0_ExecSpace = Impl::is_execution_space< Arg0 >::value };
+
+  enum { Arg0_IntConst = Impl::is_integral_constant< Arg0 >::value };
+  enum { Arg1_IntConst = Impl::is_integral_constant< Arg1 >::value };
+  enum { Arg2_IntConst = Impl::is_integral_constant< Arg2 >::value };
+
+  enum { Arg0_IntType = Impl::is_integral< Arg0 >::value };
+  enum { Arg1_IntType = Impl::is_integral< Arg1 >::value };
+  enum { Arg2_IntType = Impl::is_integral< Arg2 >::value };
+
+  enum { Arg0_WorkTag = ! Arg0_ExecSpace && ! Arg0_IntConst && ! Arg0_IntType && ! Arg0_Void };
+  enum { Arg1_WorkTag =   Arg0_ExecSpace && ! Arg1_IntConst && ! Arg1_IntType && ! Arg1_Void };
+
+  enum { ArgOption_OK = Impl::StaticAssert< (
+    ( Arg0_ExecSpace && Arg1_WorkTag && ( Arg2_IntConst || Arg2_IntType ) ) ||
+    ( Arg0_ExecSpace && Arg1_WorkTag && Arg2_Void ) ||
+    ( Arg0_ExecSpace && ( Arg1_IntConst || Arg1_IntType ) && Arg2_Void ) ||
+    ( Arg0_ExecSpace && Arg1_Void && Arg2_Void ) ||
+    ( Arg0_WorkTag && ( Arg1_IntConst || Arg1_IntType ) && Arg2_Void ) ||
+    ( Arg0_WorkTag && Arg1_Void && Arg2_Void ) ||
+    ( ( Arg0_IntConst || Arg0_IntType ) && Arg1_Void && Arg2_Void ) ||
+    ( Arg0_Void && Arg1_Void && Arg2_Void )
+    ) >::value };
+
+  // The work argument tag is the first or second argument
+  typedef typename Impl::if_c< Arg0_WorkTag , Arg0 ,
+          typename Impl::if_c< Arg1_WorkTag , Arg1 , void
+          >::type >::type
+    WorkTag ;
+
+  enum { Granularity = Arg0_IntConst ? unsigned(Impl::is_integral_constant<Arg0>::integral_value) : (
+                       Arg1_IntConst ? unsigned(Impl::is_integral_constant<Arg1>::integral_value) : (
+                       Arg2_IntConst ? unsigned(Impl::is_integral_constant<Arg2>::integral_value) : (
+                                       unsigned(DefaultIntValue) ))) };
+
+  // Only accept the integral type if the blocking is a power of two
+  typedef typename Impl::enable_if< Impl::is_power_of_two< Granularity >::value ,
+            typename Impl::if_c< Arg0_IntType , Arg0 ,
+            typename Impl::if_c< Arg1_IntType , Arg1 ,
+            typename Impl::if_c< Arg2_IntType , Arg2 ,
+            typename Impl::if_c< Arg0_IntConst , typename Impl::is_integral_constant<Arg0>::integral_type ,
+            typename Impl::if_c< Arg1_IntConst , typename Impl::is_integral_constant<Arg1>::integral_type ,
+            typename Impl::if_c< Arg2_IntConst , typename Impl::is_integral_constant<Arg2>::integral_type ,
+                                                 DefaultIntType
+            >::type >::type >::type
+            >::type >::type >::type
+          >::type
+    IntType ;
+
+  enum { GranularityMask = IntType(Granularity) - 1 };
+
+  ExecSpace m_space ;
+  IntType   m_begin ;
+  IntType   m_end ;
+
+public:
+
+  //! Tag this class as an execution policy
+  typedef ExecSpace    execution_space ;
+  typedef RangePolicy  execution_policy ;
+  typedef WorkTag      work_tag ;
+  typedef IntType      member_type ;
+
+  KOKKOS_INLINE_FUNCTION const execution_space & space() const { return m_space ; }
+  KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; }
+  KOKKOS_INLINE_FUNCTION member_type end()   const { return m_end ; }
+
+  inline RangePolicy() : m_space(), m_begin(0), m_end(0) {}
+
+  /** \brief  Total range */
+  inline
+  RangePolicy( const member_type work_begin
+             , const member_type work_end
+             )
+    : m_space()
+    , m_begin( work_begin < work_end ? work_begin : 0 )
+    , m_end(   work_begin < work_end ? work_end : 0 )
+    {}
+
+  /** \brief  Total range */
+  inline
+  RangePolicy( const execution_space & work_space
+             , const member_type work_begin
+             , const member_type work_end
+             )
+    : m_space( work_space )
+    , m_begin( work_begin < work_end ? work_begin : 0 )
+    , m_end(   work_begin < work_end ? work_end : 0 )
+    {}
+
+  /** \brief  Subrange for a partition's rank and size.
+   *
+   *  Typically used to partition a range over a group of threads.
+   */
+  struct WorkRange {
+    typedef RangePolicy::work_tag     work_tag ;
+    typedef RangePolicy::member_type  member_type ;
+
+    KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin ; }
+    KOKKOS_INLINE_FUNCTION member_type end()   const { return m_end ; }
+
+    /** \brief  Subrange for a partition's rank and size.
+     *
+     *  Typically used to partition a range over a group of threads.
+     */
+    KOKKOS_INLINE_FUNCTION
+    WorkRange( const RangePolicy & range
+             , const int part_rank
+             , const int part_size
+             )
+      : m_begin(0), m_end(0)
+      {
+        if ( part_size ) {
+  
+          // Split evenly among partitions, then round up to the granularity.
+          const member_type work_part =
+            ( ( ( ( range.end() - range.begin() ) + ( part_size - 1 ) ) / part_size )
+              + GranularityMask ) & ~member_type(GranularityMask);
+
+          m_begin = range.begin() + work_part * part_rank ;
+          m_end   = m_begin       + work_part ;
+  
+          if ( range.end() < m_begin ) m_begin = range.end() ;
+          if ( range.end() < m_end )   m_end   = range.end() ;
+        }
+      }
+  private:
+     member_type m_begin ;
+     member_type m_end ;
+     WorkRange();
+     WorkRange & operator = ( const WorkRange & );
+  };
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Execution policy for parallel work over a league of teams of threads.
+ *
+ *  The work functor is called for each thread of each team such that
+ *  the team's member threads are guaranteed to be concurrent.
+ *
+ *  The team's threads have access to team shared scratch memory and
+ *  team collective operations.
+ *
+ *  If the WorkTag is non-void then the first calling argument of the
+ *  work functor's parentheses operator is 'const WorkTag &'.
+ *  This allows a functor to have multiple work member functions.
+ *
+ *  template argument option with specified execution space:
+ *    < ExecSpace , WorkTag >
+ *    < ExecSpace , void >
+ *
+ *  template argument option with default execution space:
+ *    < WorkTag , void >
+ *    < void , void >
+ */
+template< class Arg0 = void
+        , class Arg1 = void
+        , class ExecSpace =
+          // If the first argument is not an execution
+          // then use the default execution space.
+          typename Impl::if_c< Impl::is_execution_space< Arg0 >::value , Arg0
+                             , Kokkos::DefaultExecutionSpace >::type
+        >
+class TeamPolicy {
+private:
+
+  enum { Arg0_ExecSpace = Impl::is_execution_space< Arg0 >::value };
+  enum { Arg1_Void      = Impl::is_same< Arg1 , void >::value };
+  enum { ArgOption_OK   = Impl::StaticAssert< ( Arg0_ExecSpace || Arg1_Void ) >::value };
+
+  typedef typename Impl::if_c< Arg0_ExecSpace , Arg1 , Arg0 >::type WorkTag ;
+
+public:
+
+  //! Tag this class as an execution policy
+  typedef TeamPolicy  execution_policy ;
+  typedef ExecSpace   execution_space ;
+  typedef WorkTag     work_tag ;
+
+  //----------------------------------------
+  /** \brief  Query maximum team size for a given functor.
+   *
+   *  This size takes into account execution space concurrency limitations and
+   *  scratch memory space limitations for reductions, team reduce/scan, and
+   *  team shared memory.
+   */
+  template< class FunctorType >
+  static int team_size_max( const FunctorType & );
+
+  /** \brief  Query recommended team size for a given functor.
+   *
+   *  This size takes into account execution space concurrency limitations and
+   *  scratch memory space limitations for reductions, team reduce/scan, and
+   *  team shared memory.
+   */
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & );
+
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & , const int&);
+  //----------------------------------------
+  /** \brief  Construct policy with the given instance of the execution space */
+  TeamPolicy( const execution_space & , int league_size_request , int team_size_request );
+
+  /** \brief  Construct policy with the default instance of the execution space */
+  TeamPolicy( int league_size_request , int team_size_request );
+
+  /** \brief  The actual league size (number of teams) of the policy.
+   *
+   *  This may be smaller than the requested league size due to limitations
+   *  of the execution space.
+   */
+  KOKKOS_INLINE_FUNCTION int league_size() const ;
+
+  /** \brief  The actual team size (number of threads per team) of the policy.
+   *
+   *  This may be smaller than the requested team size due to limitations
+   *  of the execution space.
+   */
+  KOKKOS_INLINE_FUNCTION int team_size() const ;
+
+  /** \brief  Parallel execution of a functor calls the functor once with
+   *          each member of the execution policy.
+   */
+  struct member_type {
+
+    /** \brief  Handle to the currently executing team shared scratch memory */
+    KOKKOS_INLINE_FUNCTION
+    typename execution_space::scratch_memory_space team_shmem() const ;
+
+    /** \brief  Rank of this team within the league of teams */
+    KOKKOS_INLINE_FUNCTION int league_rank() const ;
+
+    /** \brief  Number of teams in the league */
+    KOKKOS_INLINE_FUNCTION int league_size() const ;
+
+    /** \brief  Rank of this thread within this team */
+    KOKKOS_INLINE_FUNCTION int team_rank() const ;
+
+    /** \brief  Number of threads in this team */
+    KOKKOS_INLINE_FUNCTION int team_size() const ;
+
+    /** \brief  Barrier among the threads of this team */
+    KOKKOS_INLINE_FUNCTION void team_barrier() const ;
+
+    /** \brief  Intra-team reduction. Returns join of all values of the team members. */
+    template< class JoinOp >
+    KOKKOS_INLINE_FUNCTION
+    typename JoinOp::value_type team_reduce( const typename JoinOp::value_type
+                                           , const JoinOp & ) const ;
+
+    /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+     *
+     *  The highest rank thread can compute the reduction total as
+     *    reduction_total = dev.team_scan( value ) + value ;
+     */
+    template< typename Type >
+    KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const ;
+
+    /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+     *          with intra-team non-deterministic ordering accumulation.
+     *
+     *  The global inter-team accumulation value will, at the end of the
+     *  league's parallel execution, be the scan's total.
+     *  Parallel execution ordering of the league's teams is non-deterministic.
+     *  As such the base value for each team's scan operation is similarly
+     *  non-deterministic.
+     */
+    template< typename Type >
+    KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const ;
+  };
+};
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+namespace Impl {
+
+template<typename iType, class TeamMemberType>
+struct TeamThreadRangeBoundariesStruct {
+private:
+
+  KOKKOS_INLINE_FUNCTION static
+  iType ibegin( const iType & arg_begin
+              , const iType & arg_end
+              , const iType & arg_rank
+              , const iType & arg_size
+              )
+    {
+      return arg_begin + ( ( arg_end - arg_begin + arg_size - 1 ) / arg_size ) * arg_rank ;
+    }
+
+  KOKKOS_INLINE_FUNCTION static
+  iType iend( const iType & arg_begin
+            , const iType & arg_end
+            , const iType & arg_rank
+            , const iType & arg_size
+            )
+    {
+      const iType end_ = arg_begin + ( ( arg_end - arg_begin + arg_size - 1 ) / arg_size ) * ( arg_rank + 1 );
+      return end_ < arg_end ? end_ : arg_end ;
+    }
+
+public:
+
+  typedef iType index_type;
+  const iType start;
+  const iType end;
+  enum {increment = 1};
+  const TeamMemberType& thread;
+
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct( const TeamMemberType& arg_thread
+                                , const iType& arg_end
+                                )
+    : start( ibegin( 0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
+    , end(   iend(   0 , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
+    , thread( arg_thread )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct( const TeamMemberType& arg_thread
+                                , const iType& arg_begin
+                                , const iType& arg_end
+                                )
+    : start( ibegin( arg_begin , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
+    , end(   iend(   arg_begin , arg_end , arg_thread.team_rank() , arg_thread.team_size() ) )
+    , thread( arg_thread )
+    {}
+};
+
+  template<typename iType, class TeamMemberType>
+  struct ThreadVectorRangeBoundariesStruct {
+    typedef iType index_type;
+    enum {start = 0};
+    const iType end;
+    enum {increment = 1};
+
+    KOKKOS_INLINE_FUNCTION
+    ThreadVectorRangeBoundariesStruct (const TeamMemberType& thread, const iType& count):
+      end( count )
+    {}
+  };
+
+  template<class TeamMemberType>
+  struct ThreadSingleStruct {
+    const TeamMemberType& team_member;
+    KOKKOS_INLINE_FUNCTION
+    ThreadSingleStruct(const TeamMemberType& team_member_):team_member(team_member_){}
+  };
+
+  template<class TeamMemberType>
+  struct VectorSingleStruct {
+    const TeamMemberType& team_member;
+    KOKKOS_INLINE_FUNCTION
+    VectorSingleStruct(const TeamMemberType& team_member_):team_member(team_member_){}
+  };
+} // namespace Impl
+
+/** \brief  Execution policy for parallel work over a threads within a team.
+ *
+ *  The range is split over all threads in a team. The Mapping scheme depends on the architecture.
+ *  This policy is used together with a parallel pattern as a nested layer within a kernel launched
+ *  with the TeamPolicy. This variant expects a single count. So the range is (0,count].
+ */
+template<typename iType, class TeamMemberType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(const TeamMemberType&, const iType& count);
+
+/** \brief  Execution policy for parallel work over a threads within a team.
+ *
+ *  The range is split over all threads in a team. The Mapping scheme depends on the architecture.
+ *  This policy is used together with a parallel pattern as a nested layer within a kernel launched
+ *  with the TeamPolicy. This variant expects a begin and end. So the range is (begin,end].
+ */
+template<typename iType, class TeamMemberType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,TeamMemberType> TeamThreadRange(const TeamMemberType&, const iType& begin, const iType& end);
+
+/** \brief  Execution policy for a vector parallel loop.
+ *
+ *  The range is split over all vector lanes in a thread. The Mapping scheme depends on the architecture.
+ *  This policy is used together with a parallel pattern as a nested layer within a kernel launched
+ *  with the TeamPolicy. This variant expects a single count. So the range is (0,count].
+ */
+template<typename iType, class TeamMemberType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType> ThreadVectorRange(const TeamMemberType&, const iType& count);
+
+} // namespace Kokkos
+
+#endif /* #define KOKKOS_EXECPOLICY_HPP */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..012743d43ce31af7f1be6a91b9aafa951241b6be
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp
@@ -0,0 +1,270 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HOSTSPACE_HPP
+#define KOKKOS_HOSTSPACE_HPP
+
+#include <cstring>
+#include <string>
+#include <iosfwd>
+#include <typeinfo>
+
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+#include <impl/Kokkos_AllocationTracker.hpp>
+#include <impl/Kokkos_BasicAllocators.hpp>
+
+#include <impl/KokkosExp_SharedAlloc.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+/// \brief Initialize lock array for arbitrary size atomics.
+///
+/// Arbitrary atomics are implemented using a hash table of locks
+/// where the hash value is derived from the address of the
+/// object for which an atomic operation is performed.
+/// This function initializes the locks to zero (unset).
+void init_lock_array_host_space();
+
+/// \brief Aquire a lock for the address
+///
+/// This function tries to aquire the lock for the hash value derived
+/// from the provided ptr. If the lock is successfully aquired the
+/// function returns true. Otherwise it returns false.
+bool lock_address_host_space(void* ptr);
+
+/// \brief Release lock for the address
+///
+/// This function releases the lock for the hash value derived
+/// from the provided ptr. This function should only be called
+/// after previously successfully aquiring a lock with
+/// lock_address.
+void unlock_address_host_space(void* ptr);
+
+} // namespace Impl
+} // namespace Kokkos
+
+namespace Kokkos {
+
+/// \class HostSpace
+/// \brief Memory management for host memory.
+///
+/// HostSpace is a memory space that governs host memory.  "Host"
+/// memory means the usual CPU-accessible memory.
+class HostSpace {
+public:
+
+  //! Tag this class as a kokkos memory space
+  typedef HostSpace  memory_space ;
+  typedef size_t     size_type ;
+
+  /// \typedef execution_space
+  /// \brief Default execution space for this memory space.
+  ///
+  /// Every memory space has a default execution space.  This is
+  /// useful for things like initializing a View (which happens in
+  /// parallel using the View's default execution space).
+#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
+  typedef Kokkos::OpenMP   execution_space ;
+#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
+  typedef Kokkos::Threads  execution_space ;
+#elif defined( KOKKOS_HAVE_OPENMP )
+  typedef Kokkos::OpenMP   execution_space ;
+#elif defined( KOKKOS_HAVE_PTHREAD )
+  typedef Kokkos::Threads  execution_space ;
+#elif defined( KOKKOS_HAVE_SERIAL )
+  typedef Kokkos::Serial   execution_space ;
+#else
+#  error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads.  You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
+#endif
+
+  //! This memory space preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+
+#if defined( KOKKOS_USE_PAGE_ALIGNED_HOST_MEMORY )
+  typedef Impl::PageAlignedAllocator allocator ;
+#else
+  typedef Impl::AlignedAllocator allocator ;
+#endif
+
+  /** \brief  Allocate a contiguous block of memory.
+   *
+   *  The input label is associated with the block of memory.
+   *  The block of memory is tracked via reference counting where
+   *  allocation gives it a reference count of one.
+   */
+  static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
+
+  /*--------------------------------*/
+  /* Functions unique to the HostSpace */
+  static int in_parallel();
+
+  static void register_in_parallel( int (*)() );
+
+  /*--------------------------------*/
+
+  /**\brief  Default memory space instance */
+  HostSpace();
+  HostSpace( const HostSpace & rhs ) = default ;
+  HostSpace & operator = ( const HostSpace & ) = default ;
+  ~HostSpace() = default ;
+
+  /**\brief  Non-default memory space instance to choose allocation mechansim, if available */
+
+  enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC };
+
+  explicit
+  HostSpace( const AllocationMechanism & );
+
+  /**\brief  Allocate memory in the host space */
+  void * allocate( const size_t arg_alloc_size ) const ;
+
+  /**\brief  Deallocate memory in the host space */
+  void deallocate( void * const arg_alloc_ptr 
+                 , const size_t arg_alloc_size ) const ;
+
+private:
+
+  AllocationMechanism  m_alloc_mech ;
+
+  friend class Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > ;
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template<>
+class SharedAllocationRecord< Kokkos::HostSpace , void >
+  : public SharedAllocationRecord< void , void >
+{
+private:
+
+  friend Kokkos::HostSpace ;
+
+  typedef SharedAllocationRecord< void , void >  RecordBase ;
+
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  static void deallocate( RecordBase * );
+
+  /**\brief  Root record for tracked allocations from this HostSpace instance */
+  static RecordBase s_root_record ;
+
+  const Kokkos::HostSpace m_space ;
+
+protected:
+
+  ~SharedAllocationRecord();
+  SharedAllocationRecord() = default ;
+
+  SharedAllocationRecord( const Kokkos::HostSpace        & arg_space
+                        , const std::string              & arg_label
+                        , const size_t                     arg_alloc_size
+                        , const RecordBase::function_type  arg_dealloc = & deallocate
+                        );
+
+public:
+
+  inline
+  std::string get_label() const
+    {
+      return std::string( RecordBase::head()->m_label );
+    }
+
+  KOKKOS_INLINE_FUNCTION static
+  SharedAllocationRecord * allocate( const Kokkos::HostSpace &  arg_space
+                                   , const std::string       &  arg_label
+                                   , const size_t               arg_alloc_size
+                                   )
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
+#else
+      return (SharedAllocationRecord *) 0 ;
+#endif
+    }
+
+
+  static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
+
+  static void print_records( std::ostream & , const Kokkos::HostSpace & , bool detail = false );
+};
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class , class > struct DeepCopy ;
+
+template<>
+struct DeepCopy<HostSpace,HostSpace> {
+  DeepCopy( void * dst , const void * src , size_t n );
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+
+#endif /* #define KOKKOS_HOSTSPACE_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_Layout.hpp b/lib/kokkos/core/src/Kokkos_Layout.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..32822889df28cb7c928d3bf99184249d3cb2748d
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Layout.hpp
@@ -0,0 +1,174 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Layout.hpp
+/// \brief Declaration of various \c MemoryLayout options.
+
+#ifndef KOKKOS_LAYOUT_HPP
+#define KOKKOS_LAYOUT_HPP
+
+#include <stddef.h>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+/// \struct LayoutLeft
+/// \brief Memory layout tag indicating left-to-right (Fortran scheme)
+///   striding of multi-indices.
+///
+/// This is an example of a \c MemoryLayout template parameter of
+/// View.  The memory layout describes how View maps from a
+/// multi-index (i0, i1, ..., ik) to a memory location.  
+///
+/// "Layout left" indicates a mapping where the leftmost index i0
+/// refers to contiguous access, and strides increase for dimensions
+/// going right from there (i1, i2, ...).  This layout imitates how
+/// Fortran stores multi-dimensional arrays.  For the special case of
+/// a two-dimensional array, "layout left" is also called "column
+/// major."
+struct LayoutLeft {
+  //! Tag this class as a kokkos array layout
+  typedef LayoutLeft array_layout ;
+};
+
+//----------------------------------------------------------------------------
+/// \struct LayoutRight
+/// \brief Memory layout tag indicating right-to-left (C or
+///   lexigraphical scheme) striding of multi-indices.
+///
+/// This is an example of a \c MemoryLayout template parameter of
+/// View.  The memory layout describes how View maps from a
+/// multi-index (i0, i1, ..., ik) to a memory location.  
+///
+/// "Right layout" indicates a mapping where the rightmost index ik
+/// refers to contiguous access, and strides increase for dimensions
+/// going left from there.  This layout imitates how C stores
+/// multi-dimensional arrays.  For the special case of a
+/// two-dimensional array, "layout right" is also called "row major."
+struct LayoutRight {
+  //! Tag this class as a kokkos array layout
+  typedef LayoutRight array_layout ;
+};
+
+//----------------------------------------------------------------------------
+/// \struct LayoutStride
+/// \brief  Memory layout tag indicated arbitrarily strided
+///         multi-index mapping into contiguous memory.
+struct LayoutStride {
+
+  //! Tag this class as a kokkos array layout
+  typedef LayoutStride array_layout ;
+
+  enum { MAX_RANK = 8 };
+
+  size_t dimension[ MAX_RANK ] ;
+  size_t stride[ MAX_RANK ] ; 
+
+  /** \brief  Compute strides from ordered dimensions.
+   *
+   *  Values of order uniquely form the set [0..rank)
+   *  and specify ordering of the dimensions.
+   *  Order = {0,1,2,...} is LayoutLeft
+   *  Order = {...,2,1,0} is LayoutRight
+   */
+  template< typename iTypeOrder , typename iTypeDimen >
+  KOKKOS_INLINE_FUNCTION static
+  LayoutStride order_dimensions( int const rank
+                               , iTypeOrder const * const order
+                               , iTypeDimen const * const dimen )
+    {
+      LayoutStride tmp ;
+      // Verify valid rank order:
+      int check_input = MAX_RANK < rank ? 0 : int( 1 << rank ) - 1 ;
+      for ( int r = 0 ; r < MAX_RANK ; ++r ) {
+        tmp.dimension[r] = 0 ;
+        tmp.stride[r]    = 0 ;
+        check_input &= ~int( 1 << order[r] );
+      }
+      if ( 0 == check_input ) {
+        size_t n = 1 ;
+        for ( int r = 0 ; r < rank ; ++r ) {
+          tmp.stride[ order[r] ] = n ;
+          n *= ( dimen[order[r]] );
+          tmp.dimension[r] = dimen[r];
+        }
+      }
+      return tmp ;
+    }
+};
+
+//----------------------------------------------------------------------------
+/// \struct LayoutTileLeft
+/// \brief Memory layout tag indicating left-to-right (Fortran scheme)
+///   striding of multi-indices by tiles.
+///
+/// This is an example of a \c MemoryLayout template parameter of
+/// View.  The memory layout describes how View maps from a
+/// multi-index (i0, i1, ..., ik) to a memory location.  
+///
+/// "Tiled layout" indicates a mapping to contiguously stored
+/// <tt>ArgN0</tt> by <tt>ArgN1</tt> tiles for the rightmost two
+/// dimensions.  Indices are LayoutLeft within each tile, and the
+/// tiles themselves are arranged using LayoutLeft.  Note that the
+/// dimensions <tt>ArgN0</tt> and <tt>ArgN1</tt> of the tiles must be
+/// compile-time constants.  This speeds up index calculations.  If
+/// both tile dimensions are powers of two, Kokkos can optimize
+/// further.
+template < unsigned ArgN0 , unsigned ArgN1 ,
+           bool IsPowerOfTwo = ( Impl::is_power_of_two<ArgN0>::value &&
+                                 Impl::is_power_of_two<ArgN1>::value )
+         >
+struct LayoutTileLeft {
+  //! Tag this class as a kokkos array layout
+  typedef LayoutTileLeft<ArgN0,ArgN1,IsPowerOfTwo> array_layout ;
+
+  enum { N0 = ArgN0 };
+  enum { N1 = ArgN1 };
+};
+
+} // namespace Kokkos
+
+#endif // #ifndef KOKKOS_LAYOUT_HPP
+
diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..3978a0622865d89d5f56ddb0a5f641969ed99223
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Macros.hpp
@@ -0,0 +1,397 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MACROS_HPP
+#define KOKKOS_MACROS_HPP
+
+//----------------------------------------------------------------------------
+/** Pick up configure/build options via #define macros:
+ *
+ *  KOKKOS_HAVE_CUDA                Kokkos::Cuda execution and memory spaces
+ *  KOKKOS_HAVE_PTHREAD             Kokkos::Threads execution space
+ *  KOKKOS_HAVE_QTHREAD             Kokkos::Qthread execution space
+ *  KOKKOS_HAVE_OPENMP              Kokkos::OpenMP  execution space
+ *  KOKKOS_HAVE_HWLOC               HWLOC library is available
+ *  KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK    insert array bounds checks, is expensive!
+ *  KOKKOS_HAVE_CXX11               enable C++11 features
+ *
+ *  KOKKOS_HAVE_MPI                 negotiate MPI/execution space interactions
+ *
+ *  KOKKOS_USE_CUDA_UVM             Use CUDA UVM for Cuda memory space
+ */
+
+#ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
+#include <KokkosCore_config.h>
+#endif
+
+//----------------------------------------------------------------------------
+/** Pick up compiler specific #define macros:
+ *
+ *  Macros for known compilers evaluate to an integral version value
+ *
+ *  KOKKOS_COMPILER_NVCC
+ *  KOKKOS_COMPILER_GNU
+ *  KOKKOS_COMPILER_INTEL
+ *  KOKKOS_COMPILER_IBM
+ *  KOKKOS_COMPILER_CRAYC
+ *  KOKKOS_COMPILER_APPLECC
+ *  KOKKOS_COMPILER_CLANG
+ *  KOKKOS_COMPILER_PGI
+ *
+ *  Macros for which compiler extension to use for atomics on intrinsice types
+ *
+ *  KOKKOS_ATOMICS_USE_CUDA
+ *  KOKKOS_ATOMICS_USE_GNU
+ *  KOKKOS_ATOMICS_USE_INTEL
+ *  KOKKOS_ATOMICS_USE_OPENMP31
+ *
+ *  A suite of 'KOKKOS_HAVE_PRAGMA_...' are defined for internal use.
+ *
+ *  Macros for marking functions to run in an execution space:
+ *
+ *  KOKKOS_FUNCTION
+ *  KOKKOS_INLINE_FUNCTION        request compiler to inline
+ *  KOKKOS_FORCEINLINE_FUNCTION   force compiler to inline, use with care!
+ */
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ )
+
+/*  Compiling with a CUDA compiler.
+ *
+ *  Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
+ *    CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
+ *
+ *  When generating device code the __CUDA_ARCH__ macro is defined as:
+ *    __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
+ */
+
+#include <cuda_runtime.h>
+#include <cuda.h>
+
+#if ! defined( CUDA_VERSION )
+#error "#include <cuda.h> did not define CUDA_VERSION"
+#endif
+
+#if ( CUDA_VERSION < 6050 )
+// CUDA supports (inofficially) C++11 in device code starting with
+// version 6.5. This includes auto type and device code internal
+// lambdas.
+#error "Cuda version 6.5 or greater required"
+#endif
+
+#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 300 )
+/*  Compiling with CUDA compiler for device code. */
+#error "Cuda device capability >= 3.0 is required"
+#endif
+
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ ) */
+
+/*--------------------------------------------------------------------------*/
+/* Language info: C++, CUDA, OPENMP */
+
+#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
+  // Compiling Cuda code to 'ptx'
+
+  #define KOKKOS_FORCEINLINE_FUNCTION  __device__  __host__  __forceinline__
+  #define KOKKOS_INLINE_FUNCTION       __device__  __host__  inline
+  #define KOKKOS_FUNCTION              __device__  __host__
+
+#endif /* #if defined( __CUDA_ARCH__ ) */
+
+#if defined( _OPENMP )
+
+  /*  Compiling with OpenMP.
+   *  The value of _OPENMP is an integer value YYYYMM
+   *  where YYYY and MM are the year and month designation
+   *  of the supported OpenMP API version.
+   */
+
+#endif /* #if defined( _OPENMP ) */
+
+/*--------------------------------------------------------------------------*/
+/* Mapping compiler built-ins to KOKKOS_COMPILER_*** macros */
+
+#if defined( __NVCC__ )
+  // NVIDIA compiler is being used.
+  // Code is parsed and separated into host and device code.
+  // Host code is compiled again with another compiler.
+  // Device code is compile to 'ptx'.
+  #define KOKKOS_COMPILER_NVCC __NVCC__
+
+#else
+#if defined( KOKKOS_HAVE_CXX11 ) && ! defined( KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA )
+    // CUDA (including version 6.5) does not support giving lambdas as
+    // arguments to global functions. Thus its not currently possible
+    // to dispatch lambdas from the host.
+    #define KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1
+  #endif
+#endif /* #if defined( __NVCC__ ) */
+
+#if defined( KOKKOS_HAVE_CXX11 ) && !defined (KOKKOS_LAMBDA)
+  #define KOKKOS_LAMBDA [=]
+#endif
+
+#if ! defined( __CUDA_ARCH__ ) /* Not compiling Cuda code to 'ptx'. */
+
+/* Intel compiler for host code */
+
+#if defined( __INTEL_COMPILER )
+  #define KOKKOS_COMPILER_INTEL __INTEL_COMPILER
+#elif defined( __ICC )
+  // Old define
+  #define KOKKOS_COMPILER_INTEL __ICC
+#elif defined( __ECC )
+  // Very old define
+  #define KOKKOS_COMPILER_INTEL __ECC
+#endif
+
+/* CRAY compiler for host code */
+#if defined( _CRAYC )
+  #define KOKKOS_COMPILER_CRAYC _CRAYC
+#endif
+
+#if defined( __IBMCPP__ )
+  // IBM C++
+  #define KOKKOS_COMPILER_IBM __IBMCPP__
+#elif defined( __IBMC__ )
+  #define KOKKOS_COMPILER_IBM __IBMC__
+#endif
+
+#if defined( __APPLE_CC__ )
+  #define KOKKOS_COMPILER_APPLECC __APPLE_CC__
+#endif
+
+#if defined (__clang__) && !defined (KOKKOS_COMPILER_INTEL)
+  #define KOKKOS_COMPILER_CLANG __clang_major__*100+__clang_minor__*10+__clang_patchlevel__
+#endif
+
+#if ! defined( __clang__ ) && ! defined( KOKKOS_COMPILER_INTEL ) &&defined( __GNUC__ )
+  #define KOKKOS_COMPILER_GNU __GNUC__*100+__GNUC_MINOR__*10+__GNUC_PATCHLEVEL__
+#endif
+
+#if defined( __PGIC__ ) && ! defined( __GNUC__ )
+  #define KOKKOS_COMPILER_PGI __PGIC__*100+__PGIC_MINOR__*10+__PGIC_PATCHLEVEL__
+#endif
+
+#endif /* #if ! defined( __CUDA_ARCH__ ) */
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+/* Intel compiler macros */
+
+#if defined( KOKKOS_COMPILER_INTEL )
+
+  #define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  #define KOKKOS_HAVE_PRAGMA_IVDEP 1
+  #define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
+  #define KOKKOS_HAVE_PRAGMA_VECTOR 1
+  #define KOKKOS_HAVE_PRAGMA_SIMD 1
+
+#if ( 1200 <= KOKKOS_COMPILER_INTEL ) && ! defined( KOKKOS_ENABLE_ASM ) && ! defined( _WIN32 )
+    #define KOKKOS_ENABLE_ASM 1
+  #endif
+
+  #if ( 1200 <= KOKKOS_COMPILER_INTEL ) && ! defined( KOKKOS_FORCEINLINE_FUNCTION )
+    #if !defined (_WIN32)
+      #define KOKKOS_FORCEINLINE_FUNCTION  inline __attribute__((always_inline))
+    #else
+      #define KOKKOS_FORCEINLINE_FUNCTION inline
+    #endif
+  #endif
+
+  #if defined( __MIC__ )
+    // Compiling for Xeon Phi
+  #endif
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+/* Cray compiler macros */
+
+#if defined( KOKKOS_COMPILER_CRAYC )
+
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+/* IBM Compiler macros */
+
+#if defined( KOKKOS_COMPILER_IBM )
+
+  #define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  //#define KOKKOS_HAVE_PRAGMA_IVDEP 1
+  //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
+  //#define KOKKOS_HAVE_PRAGMA_VECTOR 1
+  //#define KOKKOS_HAVE_PRAGMA_SIMD 1
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+/* CLANG compiler macros */
+
+#if defined( KOKKOS_COMPILER_CLANG )
+
+  //#define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  //#define KOKKOS_HAVE_PRAGMA_IVDEP 1
+  //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
+  //#define KOKKOS_HAVE_PRAGMA_VECTOR 1
+  //#define KOKKOS_HAVE_PRAGMA_SIMD 1
+
+  #if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
+    #define KOKKOS_FORCEINLINE_FUNCTION  inline __attribute__((always_inline))
+  #endif
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+/* GNU Compiler macros */
+
+#if defined( KOKKOS_COMPILER_GNU )
+
+  //#define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  //#define KOKKOS_HAVE_PRAGMA_IVDEP 1
+  //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
+  //#define KOKKOS_HAVE_PRAGMA_VECTOR 1
+  //#define KOKKOS_HAVE_PRAGMA_SIMD 1
+
+  #if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
+    #define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
+  #endif
+
+  #if ! defined( KOKKOS_ENABLE_ASM ) && \
+      ! ( defined( __powerpc) || \
+          defined(__powerpc__) || \
+          defined(__powerpc64__) || \
+          defined(__POWERPC__) || \
+          defined(__ppc__) || \
+          defined(__ppc64__) || \
+          defined(__PGIC__) )
+    #define KOKKOS_ENABLE_ASM 1
+  #endif
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+#if defined( KOKKOS_COMPILER_PGI )
+
+  #define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  #define KOKKOS_HAVE_PRAGMA_IVDEP 1
+  //#define KOKKOS_HAVE_PRAGMA_LOOPCOUNT 1
+  #define KOKKOS_HAVE_PRAGMA_VECTOR 1
+  //#define KOKKOS_HAVE_PRAGMA_SIMD 1
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+#if defined( KOKKOS_COMPILER_NVCC )
+
+  #if defined(__CUDA_ARCH__ )
+    #define KOKKOS_HAVE_PRAGMA_UNROLL 1
+  #endif
+
+#endif
+
+//----------------------------------------------------------------------------
+/** Define function marking macros if compiler specific macros are undefined: */
+
+#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
+#define KOKKOS_FORCEINLINE_FUNCTION  inline
+#endif
+
+#if ! defined( KOKKOS_INLINE_FUNCTION )
+#define KOKKOS_INLINE_FUNCTION  inline
+#endif
+
+#if ! defined( KOKKOS_FUNCTION )
+#define KOKKOS_FUNCTION /**/
+#endif
+
+//----------------------------------------------------------------------------
+/** Determine the default execution space for parallel dispatch.
+ *  There is zero or one default execution space specified.
+ */
+
+#if 1 < ( ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
+          ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
+          ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
+          ( defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL ) ? 1 : 0 ) )
+
+#error "More than one KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_* specified" ;
+
+#endif
+
+/** If default is not specified then chose from enabled execution spaces.
+ *  Priority: CUDA, OPENMP, THREADS, SERIAL
+ */
+#if   defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA )
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
+#elif defined ( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
+#elif defined ( KOKKOS_HAVE_CUDA )
+#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_CUDA
+#elif defined ( KOKKOS_HAVE_OPENMP )
+#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP
+#elif defined ( KOKKOS_HAVE_PTHREAD )
+#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS
+#else
+#define KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL
+#endif
+
+//----------------------------------------------------------------------------
+/** Determine for what space the code is being compiled: */
+
+#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined (KOKKOS_HAVE_CUDA)
+#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
+#else
+#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_MACROS_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..b581c7da23fa0652521ee0d59a510c0769de7312
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp
@@ -0,0 +1,116 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_MEMORYTRAITS_HPP
+#define KOKKOS_MEMORYTRAITS_HPP
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Memory access traits for views, an extension point.
+ *
+ *  These traits should be orthogonal.  If there are dependencies then
+ *  the MemoryTraits template must detect and enforce dependencies.
+ *
+ *  A zero value is the default for a View, indicating that none of
+ *  these traits are present.
+ */
+enum MemoryTraitsFlags
+  { Unmanaged  = 0x01
+  , RandomAccess = 0x02
+  , Atomic = 0x04
+  };
+
+template < unsigned T >
+struct MemoryTraits {
+  //! Tag this class as a kokkos memory traits:
+  typedef MemoryTraits memory_traits ;
+
+  enum { Unmanaged    = T & unsigned(Kokkos::Unmanaged) };
+  enum { RandomAccess = T & unsigned(Kokkos::RandomAccess) };
+  enum { Atomic       = T & unsigned(Kokkos::Atomic) };
+
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+typedef Kokkos::MemoryTraits<0> MemoryManaged ;
+typedef Kokkos::MemoryTraits< Kokkos::Unmanaged > MemoryUnmanaged ;
+typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::RandomAccess > MemoryRandomAccess ;
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief Memory alignment settings
+ *
+ *  Sets global value for memory alignment.  Must be a power of two!
+ *  Enable compatibility of views from different devices with static stride.
+ *  Use compiler flag to enable overwrites.
+ */
+enum { MEMORY_ALIGNMENT =
+#if defined( KOKKOS_MEMORY_ALIGNMENT )
+    ( 1 << Kokkos::Impl::power_of_two< KOKKOS_MEMORY_ALIGNMENT >::value )
+#else
+    ( 1 << Kokkos::Impl::power_of_two< 128 >::value )
+#endif
+  , MEMORY_ALIGNMENT_THRESHOLD = 4 
+  };
+
+
+} //namespace Impl
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_MEMORYTRAITS_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..508da04c87ad7b9ea459b8ca1dde8f310587c59e
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_OpenMP.hpp
@@ -0,0 +1,175 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMP_HPP
+#define KOKKOS_OPENMP_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_HAVE_OPENMP ) && defined( _OPENMP )
+
+#include <omp.h>
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_Layout.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/// \class OpenMP
+/// \brief Kokkos device for multicore processors in the host memory space.
+class OpenMP {
+public:
+  //------------------------------------
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+
+  //! Tag this class as a kokkos execution space
+  typedef OpenMP                execution_space ;
+  typedef HostSpace             memory_space ;
+  //! This execution space preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  typedef LayoutRight           array_layout ;
+  typedef HostSpace::size_type  size_type ;
+
+  typedef ScratchMemorySpace< OpenMP > scratch_memory_space ;
+
+  //@}
+  //------------------------------------
+  //! \name Functions that all Kokkos devices must implement.
+  //@{
+
+  inline static bool in_parallel() { return omp_in_parallel(); }
+
+  /** \brief  Set the device in a "sleep" state. A noop for OpenMP.  */
+  static bool sleep();
+
+  /** \brief Wake the device from the 'sleep' state. A noop for OpenMP. */
+  static bool wake();
+
+  /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */
+  static void fence() {}
+
+  /// \brief Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  /// \brief Free any resources being consumed by the device.
+  static void finalize();
+
+  /** \brief  Initialize the device.
+   *
+   *  1) If the hardware locality library is enabled and OpenMP has not
+   *     already bound threads then bind OpenMP threads to maximize
+   *     core utilization and group for memory hierarchy locality.
+   *
+   *  2) Allocate a HostThread for each OpenMP thread to hold its
+   *     topology and fan in/out data.
+   */
+  static void initialize( unsigned thread_count = 0 ,
+                          unsigned use_numa_count = 0 ,
+                          unsigned use_cores_per_numa = 0 );
+
+  static int is_initialized();
+  //@}
+  //------------------------------------
+  /** \brief  This execution space has a topological thread pool which can be queried.
+   *
+   *  All threads within a pool have a common memory space for which they are cache coherent.
+   *    depth = 0  gives the number of threads in the whole pool.
+   *    depth = 1  gives the number of threads in a NUMA region, typically sharing L3 cache.
+   *    depth = 2  gives the number of threads at the finest granularity, typically sharing L1 cache.
+   */
+  inline static int thread_pool_size( int depth = 0 );
+
+  /** \brief  The rank of the executing thread in this thread pool */
+  KOKKOS_INLINE_FUNCTION static int thread_pool_rank();
+
+  //------------------------------------
+
+  inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
+
+  KOKKOS_INLINE_FUNCTION static
+  unsigned hardware_thread_id() { return thread_pool_rank(); }
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::OpenMP::memory_space
+  , Kokkos::OpenMP::scratch_memory_space
+  >
+{
+  enum { value = true };
+  inline static void verify( void ) { }
+  inline static void verify( const void * ) { }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#include <OpenMP/Kokkos_OpenMPexec.hpp>
+#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+#endif /* #if defined( KOKKOS_HAVE_OPENMP ) && defined( _OPENMP ) */
+#endif /* #ifndef KOKKOS_OPENMP_HPP */
+
+
diff --git a/lib/kokkos/core/src/Kokkos_Pair.hpp b/lib/kokkos/core/src/Kokkos_Pair.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..52de637a56dcf4e47ed1a6791a407f7d465eff17
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Pair.hpp
@@ -0,0 +1,498 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+/// \file Kokkos_Pair.hpp
+/// \brief Declaration and definition of Kokkos::pair.
+///
+/// This header file declares and defines Kokkos::pair and its related
+/// nonmember functions.
+
+#ifndef KOKKOS_PAIR_HPP
+#define KOKKOS_PAIR_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <utility>
+
+namespace Kokkos {
+/// \struct pair
+/// \brief Replacement for std::pair that works on CUDA devices.
+///
+/// The instance methods of std::pair, including its constructors, are
+/// not marked as <tt>__device__</tt> functions.  Thus, they cannot be
+/// called on a CUDA device, such as an NVIDIA GPU.  This struct
+/// implements the same interface as std::pair, but can be used on a
+/// CUDA device as well as on the host.
+template <class T1, class T2>
+struct pair
+{
+  //! The first template parameter of this class.
+  typedef T1 first_type;
+  //! The second template parameter of this class.
+  typedef T2 second_type;
+
+  //! The first element of the pair.
+  first_type  first;
+  //! The second element of the pair.
+  second_type second;
+
+  /// \brief Default constructor.
+  ///
+  /// This calls the default constructors of T1 and T2.  It won't
+  /// compile if those default constructors are not defined and
+  /// public.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair()
+    : first(), second()
+  {}
+
+  /// \brief Constructor that takes both elements of the pair.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(first_type const& f, second_type const& s)
+    : first(f), second(s)
+  {}
+
+  /// \brief Copy constructor.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair( const pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Assignment operator.
+  ///
+  /// This calls the assignment operators of T1 and T2.  It won't
+  /// compile if the assignment operators are not defined and public.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<T1, T2> & operator=(const pair<U,V> &p)
+  {
+    first = p.first;
+    second = p.second;
+    return *this;
+  }
+
+  // from std::pair<U,V>
+  template <class U, class V>
+  pair( const std::pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Return the std::pair version of this object.
+  ///
+  /// This is <i>not</i> a device function; you may not call it on a
+  /// CUDA device.  It is meant to be called on the host, if the user
+  /// wants an std::pair instead of a Kokkos::pair.
+  ///
+  /// \note This is not a conversion operator, since defining a
+  ///   conversion operator made the relational operators have
+  ///   ambiguous definitions.
+  std::pair<T1,T2> to_std_pair() const
+  { return std::make_pair(first,second); }
+};
+
+template <class T1, class T2>
+struct pair<T1&, T2&>
+{
+  //! The first template parameter of this class.
+  typedef T1& first_type;
+  //! The second template parameter of this class.
+  typedef T2& second_type;
+
+  //! The first element of the pair.
+  first_type  first;
+  //! The second element of the pair.
+  second_type second;
+
+  /// \brief Constructor that takes both elements of the pair.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(first_type f, second_type s)
+    : first(f), second(s)
+  {}
+
+  /// \brief Copy constructor.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair( const pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  // from std::pair<U,V>
+  template <class U, class V>
+  pair( const std::pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Assignment operator.
+  ///
+  /// This calls the assignment operators of T1 and T2.  It won't
+  /// compile if the assignment operators are not defined and public.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<first_type, second_type> & operator=(const pair<U,V> &p)
+  {
+    first = p.first;
+    second = p.second;
+    return *this;
+  }
+
+  /// \brief Return the std::pair version of this object.
+  ///
+  /// This is <i>not</i> a device function; you may not call it on a
+  /// CUDA device.  It is meant to be called on the host, if the user
+  /// wants an std::pair instead of a Kokkos::pair.
+  ///
+  /// \note This is not a conversion operator, since defining a
+  ///   conversion operator made the relational operators have
+  ///   ambiguous definitions.
+  std::pair<T1,T2> to_std_pair() const
+  { return std::make_pair(first,second); }
+};
+
+template <class T1, class T2>
+struct pair<T1, T2&>
+{
+  //! The first template parameter of this class.
+  typedef T1  first_type;
+  //! The second template parameter of this class.
+  typedef T2& second_type;
+
+  //! The first element of the pair.
+  first_type  first;
+  //! The second element of the pair.
+  second_type second;
+
+  /// \brief Constructor that takes both elements of the pair.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(first_type const& f, second_type s)
+    : first(f), second(s)
+  {}
+
+  /// \brief Copy constructor.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair( const pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  // from std::pair<U,V>
+  template <class U, class V>
+  pair( const std::pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Assignment operator.
+  ///
+  /// This calls the assignment operators of T1 and T2.  It won't
+  /// compile if the assignment operators are not defined and public.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<first_type, second_type> & operator=(const pair<U,V> &p)
+  {
+    first = p.first;
+    second = p.second;
+    return *this;
+  }
+
+  /// \brief Return the std::pair version of this object.
+  ///
+  /// This is <i>not</i> a device function; you may not call it on a
+  /// CUDA device.  It is meant to be called on the host, if the user
+  /// wants an std::pair instead of a Kokkos::pair.
+  ///
+  /// \note This is not a conversion operator, since defining a
+  ///   conversion operator made the relational operators have
+  ///   ambiguous definitions.
+  std::pair<T1,T2> to_std_pair() const
+  { return std::make_pair(first,second); }
+};
+
+template <class T1, class T2>
+struct pair<T1&, T2>
+{
+  //! The first template parameter of this class.
+  typedef T1&  first_type;
+  //! The second template parameter of this class.
+  typedef T2 second_type;
+
+  //! The first element of the pair.
+  first_type  first;
+  //! The second element of the pair.
+  second_type second;
+
+  /// \brief Constructor that takes both elements of the pair.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(first_type f, second_type const& s)
+    : first(f), second(s)
+  {}
+
+  /// \brief Copy constructor.
+  ///
+  /// This calls the copy constructors of T1 and T2.  It won't compile
+  /// if those copy constructors are not defined and public.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair( const pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  // from std::pair<U,V>
+  template <class U, class V>
+  pair( const std::pair<U,V> &p)
+    : first(p.first), second(p.second)
+  {}
+
+  /// \brief Assignment operator.
+  ///
+  /// This calls the assignment operators of T1 and T2.  It won't
+  /// compile if the assignment operators are not defined and public.
+  template <class U, class V>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<first_type, second_type> & operator=(const pair<U,V> &p)
+  {
+    first = p.first;
+    second = p.second;
+    return *this;
+  }
+
+  /// \brief Return the std::pair version of this object.
+  ///
+  /// This is <i>not</i> a device function; you may not call it on a
+  /// CUDA device.  It is meant to be called on the host, if the user
+  /// wants an std::pair instead of a Kokkos::pair.
+  ///
+  /// \note This is not a conversion operator, since defining a
+  ///   conversion operator made the relational operators have
+  ///   ambiguous definitions.
+  std::pair<T1,T2> to_std_pair() const
+  { return std::make_pair(first,second); }
+};
+
+//! Equality operator for Kokkos::pair.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator== (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return lhs.first==rhs.first && lhs.second==rhs.second; }
+
+//! Inequality operator for Kokkos::pair.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator!= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return !(lhs==rhs); }
+
+//! Less-than operator for Kokkos::pair.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator<  (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return lhs.first<rhs.first || (!(rhs.first<lhs.first) && lhs.second<rhs.second); }
+
+//! Less-than-or-equal-to operator for Kokkos::pair.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator<= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return !(rhs<lhs); }
+
+//! Greater-than operator for Kokkos::pair.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator>  (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return rhs<lhs; }
+
+//! Greater-than-or-equal-to operator for Kokkos::pair.
+template <class T1, class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
+{ return !(lhs<rhs); }
+
+/// \brief Return a new pair.
+///
+/// This is a "nonmember constructor" for Kokkos::pair.  It works just
+/// like std::make_pair.
+template <class T1,class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+pair<T1,T2> make_pair (T1 x, T2 y)
+{ return ( pair<T1,T2>(x,y) ); }
+
+/// \brief Return a pair of references to the input arguments.
+///
+/// This compares to std::tie (new in C++11).  You can use it to
+/// assign to two variables at once, from the result of a function
+/// that returns a pair.  For example (<tt>__device__</tt> and
+/// <tt>__host__</tt> attributes omitted for brevity):
+/// \code
+/// // Declaration of the function to call.
+/// // First return value: operation count.
+/// // Second return value: whether all operations succeeded.
+/// Kokkos::pair<int, bool> someFunction ();
+///
+/// // Code that uses Kokkos::tie.
+/// int myFunction () {
+///   int count = 0;
+///   bool success = false;
+///
+///   // This assigns to both count and success.
+///   Kokkos::tie (count, success) = someFunction ();
+///
+///   if (! success) {
+///     // ... Some operation failed;
+///     //     take corrective action ...
+///   }
+///   return count;
+/// }
+/// \endcode
+///
+/// The line that uses tie() could have been written like this:
+/// \code
+///   Kokkos::pair<int, bool> result = someFunction ();
+///   count = result.first;
+///   success = result.second;
+/// \endcode
+///
+/// Using tie() saves two lines of code and avoids a copy of each
+/// element of the pair.  The latter could be significant if one or
+/// both elements of the pair are more substantial objects than \c int
+/// or \c bool.
+template <class T1,class T2>
+KOKKOS_FORCEINLINE_FUNCTION
+pair<T1 &,T2 &> tie (T1 & x, T2 & y)
+{ return ( pair<T1 &,T2 &>(x,y) ); }
+
+//
+// Specialization of Kokkos::pair for a \c void second argument.  This
+// is not actually a "pair"; it only contains one element, the first.
+//
+template <class T1>
+struct pair<T1,void>
+{
+  typedef T1 first_type;
+  typedef void second_type;
+
+  first_type  first;
+  enum { second = 0 };
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair()
+    : first()
+  {}
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(const first_type & f)
+    : first(f)
+  {}
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair(const first_type & f, int)
+    : first(f)
+  {}
+
+  template <class U>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair( const pair<U,void> &p)
+    : first(p.first)
+  {}
+
+  template <class U>
+  KOKKOS_FORCEINLINE_FUNCTION
+  pair<T1, void> & operator=(const pair<U,void> &p)
+  {
+    first = p.first;
+    return *this;
+  }
+};
+
+//
+// Specialization of relational operators for Kokkos::pair<T1,void>.
+//
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator== (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return lhs.first==rhs.first; }
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator!= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return !(lhs==rhs); }
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator<  (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return lhs.first<rhs.first; }
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator<= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return !(rhs<lhs); }
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator>  (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return rhs<lhs; }
+
+template <class T1>
+KOKKOS_FORCEINLINE_FUNCTION
+bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
+{ return !(lhs<rhs); }
+
+} // namespace Kokkos
+
+
+#endif //KOKKOS_PAIR_HPP
diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..d714485e70d7726eef027e7c56c3722e65881582
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp
@@ -0,0 +1,908 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Parallel.hpp
+/// \brief Declaration of parallel operators
+
+#ifndef KOKKOS_PARALLEL_HPP
+#define KOKKOS_PARALLEL_HPP
+
+#include <cstddef>
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_View.hpp>
+#include <Kokkos_ExecPolicy.hpp>
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+#include <impl/Kokkos_Profiling_Interface.hpp>
+#include <typeinfo>
+#endif
+
+#include <impl/Kokkos_AllocationTracker.hpp>
+#include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+#ifdef KOKKOS_HAVE_DEBUG
+#include<iostream>
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+/** \brief  Given a Functor and Execution Policy query an execution space.
+ *
+ *  if       the Policy has an execution space use that
+ *  else if  the Functor has an execution_space use that
+ *  else if  the Functor has a device_type use that for backward compatibility
+ *  else     use the default
+ */
+template< class Functor
+        , class Policy
+        , class EnableFunctor = void
+        , class EnablePolicy  = void
+        >
+struct FunctorPolicyExecutionSpace {
+  typedef Kokkos::DefaultExecutionSpace execution_space ;
+};
+
+template< class Functor , class Policy >
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , typename enable_if_type< typename Functor::device_type     >::type
+  , typename enable_if_type< typename Policy ::execution_space >::type
+  >
+{
+  typedef typename Policy ::execution_space execution_space ;
+};
+
+template< class Functor , class Policy >
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , typename enable_if_type< typename Functor::execution_space >::type
+  , typename enable_if_type< typename Policy ::execution_space >::type
+  >
+{
+  typedef typename Policy ::execution_space execution_space ;
+};
+
+template< class Functor , class Policy , class EnableFunctor >
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , EnableFunctor
+  , typename enable_if_type< typename Policy::execution_space >::type
+  >
+{
+  typedef typename Policy ::execution_space execution_space ;
+};
+
+template< class Functor , class Policy , class EnablePolicy >
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , typename enable_if_type< typename Functor::device_type >::type
+  , EnablePolicy
+  >
+{
+  typedef typename Functor::device_type execution_space ;
+};
+
+template< class Functor , class Policy , class EnablePolicy >
+struct FunctorPolicyExecutionSpace
+  < Functor , Policy
+  , typename enable_if_type< typename Functor::execution_space >::type
+  , EnablePolicy
+  >
+{
+  typedef typename Functor::execution_space execution_space ;
+};
+
+//----------------------------------------------------------------------------
+/// \class ParallelFor
+/// \brief Implementation of the ParallelFor operator that has a
+///   partial specialization for the device.
+///
+/// This is an implementation detail of parallel_for.  Users should
+/// skip this and go directly to the nonmember function parallel_for.
+template< class FunctorType , class ExecPolicy > class ParallelFor ;
+
+/// \class ParallelReduce
+/// \brief Implementation detail of parallel_reduce.
+///
+/// This is an implementation detail of parallel_reduce.  Users should
+/// skip this and go directly to the nonmember function parallel_reduce.
+template< class FunctorType , class ExecPolicy > class ParallelReduce ;
+
+/// \class ParallelScan
+/// \brief Implementation detail of parallel_scan.
+///
+/// This is an implementation detail of parallel_scan.  Users should
+/// skip this and go directly to the documentation of the nonmember
+/// template function Kokkos::parallel_scan.
+template< class FunctorType , class ExecPolicy > class ParallelScan ;
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief Execute \c functor in parallel according to the execution \c policy.
+ *
+ * A "functor" is a class containing the function to execute in parallel,
+ * data needed for that execution, and an optional \c execution_space
+ * typedef.  Here is an example functor for parallel_for:
+ *
+ * \code
+ *  class FunctorType {
+ *  public:
+ *    typedef  ...  execution_space ;
+ *    void operator() ( WorkType iwork ) const ;
+ *  };
+ * \endcode
+ *
+ * In the above example, \c WorkType is any integer type for which a
+ * valid conversion from \c size_t to \c IntType exists.  Its
+ * <tt>operator()</tt> method defines the operation to parallelize,
+ * over the range of integer indices <tt>iwork=[0,work_count-1]</tt>.
+ * This compares to a single iteration \c iwork of a \c for loop.
+ * If \c execution_space is not defined DefaultExecutionSpace will be used.
+ */
+template< class ExecPolicy , class FunctorType >
+inline
+void parallel_for( const ExecPolicy  & policy
+                 , const FunctorType & functor
+                 , const std::string& str = ""
+                 , typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
+                 )
+{
+#ifdef KOKKOSP_ENABLE_PROFILING
+    uint64_t kpID = 0;
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+     	Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     }
+#endif
+
+    (void) Impl::ParallelFor< FunctorType , ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy );
+   
+#ifdef KOKKOSP_ENABLE_PROFILING
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+        Kokkos::Experimental::endParallelFor(kpID);
+     }
+#endif
+}
+
+template< class FunctorType >
+inline
+void parallel_for( const size_t        work_count
+                 , const FunctorType & functor
+                 , const std::string& str = ""
+                 )
+{
+  typedef typename
+    Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
+      execution_space ;
+  typedef RangePolicy< execution_space > policy ;
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+  uint64_t kpID = 0;
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+  	Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     }
+#endif
+    
+  (void) Impl::ParallelFor< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::endParallelFor(kpID);
+     }
+#endif
+}
+
+template< class ExecPolicy , class FunctorType >
+inline
+void parallel_for( const std::string & str
+                 , const ExecPolicy  & policy
+                 , const FunctorType & functor )
+{
+  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG Start parallel_for kernel: " << str << std::endl;
+  #endif
+
+  parallel_for(policy,functor,str);
+
+  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG End   parallel_for kernel: " << str << std::endl;
+  #endif
+  (void) str;
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+/** \brief  Parallel reduction
+ *
+ * Example of a parallel_reduce functor for a POD (plain old data) value type:
+ * \code
+ *  class FunctorType { // For POD value type
+ *  public:
+ *    typedef    ...     execution_space ;
+ *    typedef <podType>  value_type ;
+ *    void operator()( <intType> iwork , <podType> & update ) const ;
+ *    void init( <podType> & update ) const ;
+ *    void join( volatile       <podType> & update ,
+ *               volatile const <podType> & input ) const ;
+ *
+ *    typedef true_type has_final ;
+ *    void final( <podType> & update ) const ;
+ *  };
+ * \endcode
+ *
+ * Example of a parallel_reduce functor for an array of POD (plain old data) values:
+ * \code
+ *  class FunctorType { // For array of POD value
+ *  public:
+ *    typedef    ...     execution_space ;
+ *    typedef <podType>  value_type[] ;
+ *    void operator()( <intType> , <podType> update[] ) const ;
+ *    void init( <podType> update[] ) const ;
+ *    void join( volatile       <podType> update[] ,
+ *               volatile const <podType> input[] ) const ;
+ *
+ *    typedef true_type has_final ;
+ *    void final( <podType> update[] ) const ;
+ *  };
+ * \endcode
+ */
+template< class ExecPolicy , class FunctorType >
+inline
+void parallel_reduce( const ExecPolicy  & policy
+                    , const FunctorType & functor
+                    , const std::string& str = ""
+                    , typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
+                    )
+{
+  // typedef typename
+  //   Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
+  //     execution_space ;
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >  ValueTraits ;
+
+  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
+                                     , typename ValueTraits::value_type
+                                     , typename ValueTraits::pointer_type
+                                     >::type value_type ;
+
+  Kokkos::View< value_type
+              , HostSpace
+              , Kokkos::MemoryUnmanaged
+              >
+    result_view ;
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+  uint64_t kpID = 0;
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+  	Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     }
+#endif
+
+     (void) Impl::ParallelReduce< FunctorType , ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , result_view );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::endParallelReduce(kpID);
+     }
+#endif
+}
+
+// integral range policy
+template< class FunctorType >
+inline
+void parallel_reduce( const size_t        work_count
+                    , const FunctorType & functor
+                    , const std::string& str = ""
+                    )
+{
+  typedef typename
+    Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
+      execution_space ;
+
+  typedef RangePolicy< execution_space > policy ;
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  ValueTraits ;
+
+  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
+                                     , typename ValueTraits::value_type
+                                     , typename ValueTraits::pointer_type
+                                     >::type value_type ;
+
+  Kokkos::View< value_type
+              , HostSpace
+              , Kokkos::MemoryUnmanaged
+              >
+    result_view ;
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+  uint64_t kpID = 0;
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+  	Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     }
+#endif
+    
+  (void) Impl::ParallelReduce< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) , result_view );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::endParallelReduce(kpID);
+     }
+#endif
+
+}
+
+// general policy and view ouput
+template< class ExecPolicy , class FunctorType , class ViewType >
+inline
+void parallel_reduce( const ExecPolicy  & policy
+                    , const FunctorType & functor
+                    , const ViewType    & result_view
+                    , const std::string& str = ""
+                    , typename Impl::enable_if<
+                      ( Impl::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value
+#ifdef KOKKOS_HAVE_CUDA
+                        && ! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value
+#endif
+                      )>::type * = 0 )
+{
+    
+#ifdef KOKKOSP_ENABLE_PROFILING
+  uint64_t kpID = 0;
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     }
+#endif
+    
+  (void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , Impl::CopyWithoutTracking::apply(result_view) );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::endParallelReduce(kpID);
+     }
+#endif
+
+}
+
+// general policy and pod or array of pod output
+template< class ExecPolicy , class FunctorType >
+void parallel_reduce( const ExecPolicy  & policy
+                    , const FunctorType & functor
+#ifdef KOKKOS_HAVE_CUDA
+                    , typename Impl::enable_if<
+                      ( ! Impl::is_integral< ExecPolicy >::value &&
+                        ! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )
+                      , typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type>::type result_ref
+                      , const std::string& str = ""
+                      , typename Impl::enable_if<! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value >::type* = 0
+                      )
+#else
+                      , typename Impl::enable_if<
+                        ( ! Impl::is_integral< ExecPolicy >::value)
+                        , typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type
+                        >::type result_ref
+                      , const std::string& str = ""
+                        )
+#endif
+{
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueOps<    FunctorType , typename ExecPolicy::work_tag >  ValueOps ;
+
+  // Wrap the result output request in a view to inform the implementation
+  // of the type and memory space.
+
+  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
+                                     , typename ValueTraits::value_type
+                                     , typename ValueTraits::pointer_type
+                                     >::type value_type ;
+
+  Kokkos::View< value_type
+              , HostSpace
+              , Kokkos::MemoryUnmanaged
+              >
+    result_view( ValueOps::pointer( result_ref )
+               , ValueTraits::value_count( functor )
+               );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+  uint64_t kpID = 0;
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     }
+#endif
+    
+  (void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , policy , Impl::CopyWithoutTracking::apply(result_view) );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::endParallelReduce(kpID);
+     }
+#endif
+
+}
+
+// integral range policy and view ouput
+template< class FunctorType , class ViewType >
+inline
+void parallel_reduce( const size_t        work_count
+                    , const FunctorType & functor
+                    , const ViewType    & result_view
+                    , const std::string& str = ""
+                    , typename Impl::enable_if<( Impl::is_view<ViewType>::value
+#ifdef KOKKOS_HAVE_CUDA
+                        && ! Impl::is_same<
+                          typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
+                          Kokkos::Cuda>::value
+#endif
+                        )>::type * = 0 )
+{
+  typedef typename
+    Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
+      execution_space ;
+
+  typedef RangePolicy< execution_space > ExecPolicy ;
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+  uint64_t kpID = 0;
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     }
+#endif
+
+  (void) Impl::ParallelReduce< FunctorType, ExecPolicy >( Impl::CopyWithoutTracking::apply(functor) , ExecPolicy(0,work_count) , Impl::CopyWithoutTracking::apply(result_view) );
+    
+#ifdef KOKKOSP_ENABLE_PROFILING
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::endParallelReduce(kpID);
+     }
+#endif
+
+}
+
+// integral range policy and pod or array of pod output
+template< class FunctorType >
+inline
+void parallel_reduce( const size_t        work_count
+                    , const FunctorType & functor
+                    , typename Kokkos::Impl::FunctorValueTraits<
+                         typename Impl::if_c<Impl::is_execution_policy<FunctorType>::value ||
+                                             Impl::is_integral<FunctorType>::value,
+                            void,FunctorType>::type
+                         , void >::reference_type result
+                    , const std::string& str = ""
+                    , typename Impl::enable_if< true
+#ifdef KOKKOS_HAVE_CUDA
+                              && ! Impl::is_same<
+                             typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
+                             Kokkos::Cuda>::value
+#endif
+                     >::type * = 0 )
+{
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueOps<    FunctorType , void >  ValueOps ;
+
+  typedef typename
+    Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
+      execution_space ;
+
+  typedef Kokkos::RangePolicy< execution_space > policy ;
+
+  // Wrap the result output request in a view to inform the implementation
+  // of the type and memory space.
+
+  typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
+                                     , typename ValueTraits::value_type
+                                     , typename ValueTraits::pointer_type
+                                     >::type value_type ;
+
+  Kokkos::View< value_type
+              , HostSpace
+              , Kokkos::MemoryUnmanaged
+              >
+    result_view( ValueOps::pointer( result )
+               , ValueTraits::value_count( functor )
+               );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+  uint64_t kpID = 0;
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     }
+#endif
+
+  (void) Impl::ParallelReduce< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) , Impl::CopyWithoutTracking::apply(result_view) );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::endParallelReduce(kpID);
+     }
+#endif
+
+}
+
+template< class ExecPolicy , class FunctorType , class ResultType >
+inline
+void parallel_reduce( const std::string & str
+                    , const ExecPolicy  & policy
+                    , const FunctorType & functor
+                    , ResultType * result)
+{
+  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
+  #endif
+
+  parallel_reduce(policy,functor,result,str);
+
+  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG End   parallel_reduce kernel: " << str << std::endl;
+  #endif
+  (void) str;
+}
+
+template< class ExecPolicy , class FunctorType , class ResultType >
+inline
+void parallel_reduce( const std::string & str
+                    , const ExecPolicy  & policy
+                    , const FunctorType & functor
+                    , ResultType & result)
+{
+  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
+  #endif
+
+  parallel_reduce(policy,functor,result,str);
+
+  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG End   parallel_reduce kernel: " << str << std::endl;
+  #endif
+  (void) str;
+}
+
+template< class ExecPolicy , class FunctorType >
+inline
+void parallel_reduce( const std::string & str
+                    , const ExecPolicy  & policy
+                    , const FunctorType & functor)
+{
+  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
+  #endif
+
+  parallel_reduce(policy,functor,str);
+
+  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG End   parallel_reduce kernel: " << str << std::endl;
+  #endif
+  (void) str;
+}
+
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/// \fn parallel_scan
+/// \tparam ExecutionPolicy The execution policy type.
+/// \tparam FunctorType     The scan functor type.
+///
+/// \param policy  [in] The execution policy.
+/// \param functor [in] The scan functor.
+///
+/// This function implements a parallel scan pattern.  The scan can
+/// be either inclusive or exclusive, depending on how you implement
+/// the scan functor.
+///
+/// A scan functor looks almost exactly like a reduce functor, except
+/// that its operator() takes a third \c bool argument, \c final_pass,
+/// which indicates whether this is the last pass of the scan
+/// operation.  We will show below how to use the \c final_pass
+/// argument to control whether the scan is inclusive or exclusive.
+///
+/// Here is the minimum required interface of a scan functor for a POD
+/// (plain old data) value type \c PodType.  That is, the result is a
+/// View of zero or more PodType.  It is also possible for the result
+/// to be an array of (same-sized) arrays of PodType, but we do not
+/// show the required interface for that here.
+/// \code
+/// template< class ExecPolicy , class FunctorType >
+/// class ScanFunctor {
+/// public:
+///   // The Kokkos device type
+///   typedef ... execution_space;
+///   // Type of an entry of the array containing the result;
+///   // also the type of each of the entries combined using
+///   // operator() or join().
+///   typedef PodType value_type;
+///
+///   void operator () (const ExecPolicy::member_type & i, value_type& update, const bool final_pass) const;
+///   void init (value_type& update) const;
+///   void join (volatile value_type& update, volatile const value_type& input) const
+/// };
+/// \endcode
+///
+/// Here is an example of a functor which computes an inclusive plus-scan
+/// of an array of \c int, in place.  If given an array [1, 2, 3, 4], this
+/// scan will overwrite that array with [1, 3, 6, 10].
+///
+/// \code
+/// template<class SpaceType>
+/// class InclScanFunctor {
+/// public:
+///   typedef SpaceType execution_space;
+///   typedef int value_type;
+///   typedef typename SpaceType::size_type size_type;
+///
+///   InclScanFunctor( Kokkos::View<value_type*, execution_space> x
+///                  , Kokkos::View<value_type*, execution_space> y ) : m_x(x), m_y(y) {}
+///
+///   void operator () (const size_type i, value_type& update, const bool final_pass) const {
+///     update += m_x(i);
+///     if (final_pass) {
+///       m_y(i) = update;
+///     }
+///   }
+///   void init (value_type& update) const {
+///     update = 0;
+///   }
+///   void join (volatile value_type& update, volatile const value_type& input) const {
+///     update += input;
+///   }
+///
+/// private:
+///   Kokkos::View<value_type*, execution_space> m_x;
+///   Kokkos::View<value_type*, execution_space> m_y;
+/// };
+/// \endcode
+///
+/// Here is an example of a functor which computes an <i>exclusive</i>
+/// scan of an array of \c int, in place.  In operator(), note both
+/// that the final_pass test and the update have switched places, and
+/// the use of a temporary.  If given an array [1, 2, 3, 4], this scan
+/// will overwrite that array with [0, 1, 3, 6].
+///
+/// \code
+/// template<class SpaceType>
+/// class ExclScanFunctor {
+/// public:
+///   typedef SpaceType execution_space;
+///   typedef int value_type;
+///   typedef typename SpaceType::size_type size_type;
+///
+///   ExclScanFunctor (Kokkos::View<value_type*, execution_space> x) : x_ (x) {}
+///
+///   void operator () (const size_type i, value_type& update, const bool final_pass) const {
+///     const value_type x_i = x_(i);
+///     if (final_pass) {
+///       x_(i) = update;
+///     }
+///     update += x_i;
+///   }
+///   void init (value_type& update) const {
+///     update = 0;
+///   }
+///   void join (volatile value_type& update, volatile const value_type& input) const {
+///     update += input;
+///   }
+///
+/// private:
+///   Kokkos::View<value_type*, execution_space> x_;
+/// };
+/// \endcode
+///
+/// Here is an example of a functor which builds on the above
+/// exclusive scan example, to compute an offsets array from a
+/// population count array, in place.  We assume that the pop count
+/// array has an extra entry at the end to store the final count.  If
+/// given an array [1, 2, 3, 4, 0], this scan will overwrite that
+/// array with [0, 1, 3, 6, 10].
+///
+/// \code
+/// template<class SpaceType>
+/// class OffsetScanFunctor {
+/// public:
+///   typedef SpaceType execution_space;
+///   typedef int value_type;
+///   typedef typename SpaceType::size_type size_type;
+///
+///   // lastIndex_ is the last valid index (zero-based) of x.
+///   // If x has length zero, then lastIndex_ won't be used anyway.
+///   OffsetScanFunctor( Kokkos::View<value_type*, execution_space> x
+///                    , Kokkos::View<value_type*, execution_space> y )
+///      : m_x(x), m_y(y), last_index_ (x.dimension_0 () == 0 ? 0 : x.dimension_0 () - 1)
+///   {}
+///
+///   void operator () (const size_type i, int& update, const bool final_pass) const {
+///     if (final_pass) {
+///       m_y(i) = update;
+///     }
+///     update += m_x(i);
+///     // The last entry of m_y gets the final sum.
+///     if (final_pass && i == last_index_) {
+///       m_y(i+1) = update;
+///     }
+///   }
+///   void init (value_type& update) const {
+///     update = 0;
+///   }
+///   void join (volatile value_type& update, volatile const value_type& input) const {
+///     update += input;
+///   }
+///
+/// private:
+///   Kokkos::View<value_type*, execution_space> m_x;
+///   Kokkos::View<value_type*, execution_space> m_y;
+///   const size_type last_index_;
+/// };
+/// \endcode
+///
+template< class ExecutionPolicy , class FunctorType >
+inline
+void parallel_scan( const ExecutionPolicy & policy
+                  , const FunctorType     & functor
+                  , const std::string& str = ""
+                  , typename Impl::enable_if< ! Impl::is_integral< ExecutionPolicy >::value >::type * = 0
+                  )
+{
+#ifdef KOKKOSP_ENABLE_PROFILING
+  uint64_t kpID = 0;
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     }
+#endif
+
+  Impl::ParallelScan< FunctorType , ExecutionPolicy > scan( Impl::CopyWithoutTracking::apply(functor) , policy );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::endParallelScan(kpID);
+     }
+#endif
+
+}
+
+template< class FunctorType >
+inline
+void parallel_scan( const size_t        work_count
+                  , const FunctorType & functor
+                  , const std::string& str = "" )
+{
+  typedef typename
+    Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
+      execution_space ;
+
+  typedef Kokkos::RangePolicy< execution_space > policy ;
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+  uint64_t kpID = 0;
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
+     }
+#endif
+    
+  (void) Impl::ParallelScan< FunctorType , policy >( Impl::CopyWithoutTracking::apply(functor) , policy(0,work_count) );
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+     if(Kokkos::Experimental::profileLibraryLoaded()) {
+	Kokkos::Experimental::endParallelScan(kpID);
+     }
+#endif
+
+}
+
+template< class ExecutionPolicy , class FunctorType >
+inline
+void parallel_scan( const std::string& str
+                  , const ExecutionPolicy & policy
+                  , const FunctorType     & functor)
+{
+  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG Start parallel_scan kernel: " << str << std::endl;
+  #endif
+
+  parallel_scan(policy,functor,str);
+
+  #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
+  Kokkos::fence();
+  std::cout << "KOKKOS_DEBUG End   parallel_scan kernel: " << str << std::endl;
+  #endif
+  (void) str;
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class Enable = void >
+struct FunctorTeamShmemSize
+{
+  static inline size_t value( const FunctorType & , int ) { return 0 ; }
+};
+
+template< class FunctorType >
+struct FunctorTeamShmemSize< FunctorType , typename Impl::enable_if< 0 < sizeof( & FunctorType::team_shmem_size ) >::type >
+{
+  static inline size_t value( const FunctorType & f , int team_size ) { return f.team_shmem_size( team_size ) ; }
+};
+
+template< class FunctorType >
+struct FunctorTeamShmemSize< FunctorType , typename Impl::enable_if< 0 < sizeof( & FunctorType::shmem_size ) >::type >
+{
+  static inline size_t value( const FunctorType & f , int team_size ) { return f.shmem_size( team_size ) ; }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_PARALLEL_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_Qthread.hpp b/lib/kokkos/core/src/Kokkos_Qthread.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..4f12c02ba0096b57a34ffef6a945d567db33e83c
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Qthread.hpp
@@ -0,0 +1,165 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QTHREAD_HPP
+#define KOKKOS_QTHREAD_HPP
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_ExecPolicy.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+class QthreadExec ;
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Execution space supported by Qthread */
+class Qthread {
+public:
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+
+  //! Tag this class as an execution space
+  typedef Qthread                  execution_space ;
+  typedef Kokkos::HostSpace        memory_space ;
+  //! This execution space preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  typedef Kokkos::LayoutRight      array_layout ;
+  typedef memory_space::size_type  size_type ;
+
+  typedef ScratchMemorySpace< Qthread > scratch_memory_space ;
+
+  //@}
+  /*------------------------------------------------------------------------*/
+
+  /** \brief  Initialization will construct one or more instances */
+  static Qthread & instance( int = 0 );
+
+  /** \brief  Set the execution space to a "sleep" state.
+   *
+   * This function sets the "sleep" state in which it is not ready for work.
+   * This may consume less resources than in an "ready" state,
+   * but it may also take time to transition to the "ready" state.
+   *
+   * \return True if enters or is in the "sleep" state.
+   *         False if functions are currently executing.
+   */
+  bool sleep();
+
+  /** \brief  Wake from the sleep state.
+   * 
+   *  \return True if enters or is in the "ready" state.
+   *          False if functions are currently executing.
+   */
+  static bool wake();
+
+  /** \brief Wait until all dispatched functions to complete.
+   * 
+   *  The parallel_for or parallel_reduce dispatch of a functor may
+   *  return asynchronously, before the functor completes.  This
+   *  method does not return until all dispatched functors on this
+   *  device have completed.
+   */
+  static void fence();
+
+  /*------------------------------------------------------------------------*/
+
+  static void initialize( int thread_count );
+  static void finalize();
+
+  /** \brief Print configuration information to the given output stream. */
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  int shepherd_size() const ;
+  int shepherd_worker_size() const ;
+};
+
+/*--------------------------------------------------------------------------*/
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::Qthread::memory_space
+  , Kokkos::Qthread::scratch_memory_space
+  >
+{
+  enum { value = true };
+  inline static void verify( void ) { }
+  inline static void verify( const void * ) { }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#include <Kokkos_Parallel.hpp>
+#include <Qthread/Kokkos_QthreadExec.hpp>
+#include <Qthread/Kokkos_Qthread_Parallel.hpp>
+
+#endif /* #define KOKKOS_QTHREAD_HPP */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..6e5b4f96242b0f9af803a71643182528017271ae
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp
@@ -0,0 +1,125 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SCRATCHSPACE_HPP
+#define KOKKOS_SCRATCHSPACE_HPP
+
+#include <stdio.h>
+#include <Kokkos_Core_fwd.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Scratch memory space associated with an execution space.
+ *
+ */
+template< class ExecSpace >
+class ScratchMemorySpace {
+public:
+
+  // Alignment of memory chunks returned by 'get'
+  // must be a power of two
+  enum { ALIGN = 8 };
+
+private:
+
+  mutable char * m_iter ;
+  char *         m_end ;
+
+  ScratchMemorySpace();
+  ScratchMemorySpace & operator = ( const ScratchMemorySpace & );
+
+  enum { MASK = ALIGN - 1 }; // Alignment used by View::shmem_size
+
+public:
+
+  //! Tag this class as a memory space
+  typedef ScratchMemorySpace                memory_space ;
+  typedef ExecSpace                         execution_space ;
+  //! This execution space preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  typedef typename ExecSpace::array_layout  array_layout ;
+  typedef typename ExecSpace::size_type     size_type ;
+
+  template< typename IntType >
+  KOKKOS_INLINE_FUNCTION static
+  IntType align( const IntType & size )
+    { return ( size + MASK ) & ~MASK ; }
+
+  template< typename IntType >
+  KOKKOS_INLINE_FUNCTION
+  void* get_shmem (const IntType& size) const {
+    void* tmp = m_iter ;
+    if (m_end < (m_iter += align (size))) {
+      m_iter -= align (size); // put it back like it was
+  #ifdef KOKKOS_HAVE_DEBUG
+      // mfh 23 Jun 2015: printf call consumes 25 registers
+      // in a CUDA build, so only print in debug mode.  The
+      // function still returns NULL if not enough memory.
+      printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
+              "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
+              long(m_end-m_iter));
+  #endif // KOKKOS_HAVE_DEBUG
+      tmp = 0;
+    }
+    return tmp;
+  }
+
+  template< typename IntType >
+  KOKKOS_INLINE_FUNCTION
+  ScratchMemorySpace( void * ptr , const IntType & size )
+    : m_iter( (char *) ptr )
+    , m_end(  m_iter + size )
+    {}
+};
+
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_SCRATCHSPACE_HPP */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/core/src/Kokkos_Serial.hpp b/lib/kokkos/core/src/Kokkos_Serial.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..5773a18b3f4c9288070be0f2a6e398d714b68ee3
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Serial.hpp
@@ -0,0 +1,892 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Serial.hpp
+/// \brief Declaration and definition of Kokkos::Serial device.
+
+#ifndef KOKKOS_SERIAL_HPP
+#define KOKKOS_SERIAL_HPP
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_Parallel.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+#if defined( KOKKOS_HAVE_SERIAL )
+
+namespace Kokkos {
+
+/// \class Serial
+/// \brief Kokkos device for non-parallel execution
+///
+/// A "device" represents a parallel execution model.  It tells Kokkos
+/// how to parallelize the execution of kernels in a parallel_for or
+/// parallel_reduce.  For example, the Threads device uses Pthreads or
+/// C++11 threads on a CPU, the OpenMP device uses the OpenMP language
+/// extensions, and the Cuda device uses NVIDIA's CUDA programming
+/// model.  The Serial device executes "parallel" kernels
+/// sequentially.  This is useful if you really do not want to use
+/// threads, or if you want to explore different combinations of MPI
+/// and shared-memory parallel programming models.
+class Serial {
+public:
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+
+  //! Tag this class as an execution space:
+  typedef Serial                execution_space ;
+  //! The size_type typedef best suited for this device.
+  typedef HostSpace::size_type  size_type ;
+  //! This device's preferred memory space.
+  typedef HostSpace             memory_space ;
+  //! This execution space preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  //! This device's preferred array layout.
+  typedef LayoutRight           array_layout ;
+
+  /// \brief  Scratch memory space
+  typedef ScratchMemorySpace< Kokkos::Serial >  scratch_memory_space ;
+
+  //@}
+
+  /// \brief True if and only if this method is being called in a
+  ///   thread-parallel function.
+  ///
+  /// For the Serial device, this method <i>always</i> returns false,
+  /// because parallel_for or parallel_reduce with the Serial device
+  /// always execute sequentially.
+  inline static int in_parallel() { return false ; }
+
+  /** \brief  Set the device in a "sleep" state.
+   *
+   * This function sets the device in a "sleep" state in which it is
+   * not ready for work.  This may consume less resources than if the
+   * device were in an "awake" state, but it may also take time to
+   * bring the device from a sleep state to be ready for work.
+   *
+   * \return True if the device is in the "sleep" state, else false if
+   *   the device is actively working and could not enter the "sleep"
+   *   state.
+   */
+  static bool sleep();
+
+  /// \brief Wake the device from the 'sleep' state so it is ready for work.
+  ///
+  /// \return True if the device is in the "ready" state, else "false"
+  ///  if the device is actively working (which also means that it's
+  ///  awake).
+  static bool wake();
+
+  /// \brief Wait until all dispatched functors complete.
+  ///
+  /// The parallel_for or parallel_reduce dispatch of a functor may
+  /// return asynchronously, before the functor completes.  This
+  /// method does not return until all dispatched functors on this
+  /// device have completed.
+  static void fence() {}
+
+  static void initialize( unsigned threads_count = 1 ,
+                          unsigned use_numa_count = 0 ,
+                          unsigned use_cores_per_numa = 0 ,
+                          bool allow_asynchronous_threadpool = false) {
+    (void) threads_count;
+    (void) use_numa_count;
+    (void) use_cores_per_numa;
+    (void) allow_asynchronous_threadpool;
+
+    // Init the array of locks used for arbitrarily sized atomics
+    Impl::init_lock_array_host_space();
+
+  }
+
+  static int is_initialized() { return 1 ; }
+
+  //! Free any resources being consumed by the device.
+  static void finalize() {}
+
+  //! Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool detail = false ) {}
+
+  //--------------------------------------------------------------------------
+
+  inline static int thread_pool_size( int = 0 ) { return 1 ; }
+  KOKKOS_INLINE_FUNCTION static int thread_pool_rank() { return 0 ; }
+
+  //--------------------------------------------------------------------------
+
+  KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
+  inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
+
+  //--------------------------------------------------------------------------
+
+  static void * scratch_memory_resize( unsigned reduce_size , unsigned shared_size );
+
+  //--------------------------------------------------------------------------
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::Serial::memory_space
+  , Kokkos::Serial::scratch_memory_space
+  >
+{
+  enum { value = true };
+  inline static void verify( void ) { }
+  inline static void verify( const void * ) { }
+};
+
+namespace SerialImpl {
+
+struct Sentinel {
+
+  void *   m_scratch ;
+  unsigned m_reduce_end ;
+  unsigned m_shared_end ;
+
+  Sentinel();
+  ~Sentinel();
+  static Sentinel & singleton();
+};
+
+inline
+unsigned align( unsigned n );
+}
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+class SerialTeamMember {
+private:
+  typedef Kokkos::ScratchMemorySpace< Kokkos::Serial > scratch_memory_space ;
+  const scratch_memory_space  m_space ;
+  const int                   m_league_rank ;
+  const int                   m_league_size ;
+
+  SerialTeamMember & operator = ( const SerialTeamMember & );
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & team_shmem() const { return m_space ; }
+
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
+
+  KOKKOS_INLINE_FUNCTION void team_barrier() const {}
+
+  template<class ValueType>
+  KOKKOS_INLINE_FUNCTION
+  void team_broadcast(const ValueType& , const int& ) const {}
+
+  template< class ValueType, class JoinOp >
+  KOKKOS_INLINE_FUNCTION
+  ValueType team_reduce( const ValueType & value , const JoinOp & ) const
+    {
+      return value ;
+    }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
+    {
+      const Type tmp = global_accum ? *global_accum : Type(0) ;
+      if ( global_accum ) { *global_accum += value ; }
+      return tmp ;
+    }
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & ) const
+    { return Type(0); }
+
+  //----------------------------------------
+  // Execution space specific:
+
+  SerialTeamMember( int arg_league_rank
+                  , int arg_league_size
+                  , int arg_shared_size
+                  );
+};
+
+} // namespace Impl
+
+
+/*
+ * < Kokkos::Serial , WorkArgTag >
+ * < WorkArgTag , Impl::enable_if< Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value >::type >
+ *
+ */
+template< class Arg0 , class Arg1 >
+class TeamPolicy< Arg0 , Arg1 , Kokkos::Serial >
+{
+private:
+
+  const int m_league_size ;
+
+public:
+
+  //! Tag this class as a kokkos execution policy
+  typedef TeamPolicy      execution_policy ;
+
+  //! Execution space of this execution policy:
+  typedef Kokkos::Serial  execution_space ;
+
+  typedef typename
+    Impl::if_c< ! Impl::is_same< Kokkos::Serial , Arg0 >::value , Arg0 , Arg1 >::type
+      work_tag ;
+
+  //----------------------------------------
+
+  template< class FunctorType >
+  static
+  int team_size_max( const FunctorType & ) { return 1 ; }
+
+  template< class FunctorType >
+  static
+  int team_size_recommended( const FunctorType & ) { return 1 ; }
+
+  template< class FunctorType >
+  static
+  int team_size_recommended( const FunctorType & , const int& ) { return 1 ; }
+
+  //----------------------------------------
+
+  inline int team_size() const { return 1 ; }
+  inline int league_size() const { return m_league_size ; }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicy( execution_space & , int league_size_request , int /* team_size_request */  , int vector_length_request = 1 )
+    : m_league_size( league_size_request )
+    { (void) vector_length_request; }
+
+  TeamPolicy( int league_size_request , int /* team_size_request */ , int vector_length_request = 1 )
+    : m_league_size( league_size_request )
+    { (void) vector_length_request; }
+
+  typedef Impl::SerialTeamMember  member_type ;
+};
+
+} /* namespace Kokkos */
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
+class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
+
+public:
+  // work tag is void
+  template< class PType >
+  inline
+  ParallelFor( typename Impl::enable_if<
+                 ( Impl::is_same< PType , Policy >::value &&
+                   Impl::is_same< typename PType::work_tag , void >::value
+                 ), const FunctorType & >::type functor
+             , const PType & policy )
+    {
+      const typename PType::member_type e = policy.end();
+      for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
+        functor( i );
+      }
+    }
+
+  // work tag is non-void
+  template< class PType >
+  inline
+  ParallelFor( typename Impl::enable_if<
+                 ( Impl::is_same< PType , Policy >::value &&
+                   ! Impl::is_same< typename PType::work_tag , void >::value
+                 ), const FunctorType & >::type functor
+             , const PType & policy )
+    {
+      const typename PType::member_type e = policy.end();
+      for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
+        functor( typename PType::work_tag() , i );
+      }
+    }
+};
+
+template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
+class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
+{
+public:
+  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
+  typedef typename Policy::work_tag                                  WorkTag ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , WorkTag >  ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  // Work tag is void
+  template< class ViewType , class PType >
+  ParallelReduce( typename Impl::enable_if<
+                    ( Impl::is_view< ViewType >::value &&
+                      Impl::is_same< typename ViewType::memory_space , HostSpace >::value &&
+                      Impl::is_same< PType , Policy >::value &&
+                      Impl::is_same< typename PType::work_tag , void >::value
+                    ), const FunctorType & >::type functor
+                , const PType     & policy
+                , const ViewType  & result
+                )
+    {
+      pointer_type result_ptr = result.ptr_on_device();
+
+      if ( ! result_ptr ) {
+        result_ptr = (pointer_type)
+          Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
+      }
+
+      reference_type update = ValueInit::init( functor , result_ptr );
+
+      const typename PType::member_type e = policy.end();
+      for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
+        functor( i , update );
+      }
+
+      Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , result_ptr );
+    }
+
+  // Work tag is non-void
+  template< class ViewType , class PType >
+  ParallelReduce( typename Impl::enable_if<
+                    ( Impl::is_view< ViewType >::value &&
+                      Impl::is_same< typename ViewType::memory_space , HostSpace >::value &&
+                      Impl::is_same< PType , Policy >::value &&
+                      ! Impl::is_same< typename PType::work_tag , void >::value
+                    ), const FunctorType & >::type functor
+                , const PType     & policy
+                , const ViewType  & result
+                )
+    {
+      pointer_type result_ptr = result.ptr_on_device();
+
+      if ( ! result_ptr ) {
+        result_ptr = (pointer_type)
+          Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
+      }
+
+      typename ValueTraits::reference_type update = ValueInit::init( functor , result_ptr );
+
+      const typename PType::member_type e = policy.end();
+      for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
+        functor( typename PType::work_tag() , i , update );
+      }
+
+      Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , result_ptr );
+    }
+};
+
+template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
+class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial > Policy ;
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , typename Policy::work_tag > ValueInit ;
+
+public:
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  // work tag is void
+  template< class PType >
+  inline
+  ParallelScan( typename Impl::enable_if<
+                 ( Impl::is_same< PType , Policy >::value &&
+                   Impl::is_same< typename PType::work_tag , void >::value
+                 ), const FunctorType & >::type functor
+             , const PType & policy )
+    {
+      pointer_type result_ptr = (pointer_type)
+        Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
+
+      reference_type update = ValueInit::init( functor , result_ptr );
+
+      const typename PType::member_type e = policy.end();
+      for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
+        functor( i , update , true );
+      }
+
+      Kokkos::Impl::FunctorFinal<  FunctorType , typename Policy::work_tag >::final( functor , result_ptr );
+    }
+
+  // work tag is non-void
+  template< class PType >
+  inline
+  ParallelScan( typename Impl::enable_if<
+                 ( Impl::is_same< PType , Policy >::value &&
+                   ! Impl::is_same< typename PType::work_tag , void >::value
+                 ), const FunctorType & >::type functor
+             , const PType & policy )
+    {
+      pointer_type result_ptr = (pointer_type)
+        Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( functor ) , 0 );
+
+      reference_type update = ValueInit::init( functor , result_ptr );
+
+      const typename PType::member_type e = policy.end();
+      for ( typename PType::member_type i = policy.begin() ; i < e ; ++i ) {
+        functor( typename PType::work_tag() , i , update , true );
+      }
+
+      Kokkos::Impl::FunctorFinal<  FunctorType , typename Policy::work_tag >::final( functor , result_ptr );
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class Arg0 , class Arg1 >
+class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > >
+{
+private:
+
+  typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > Policy ;
+
+  template< class TagType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
+                 const FunctorType & >::type functor
+             , const typename Policy::member_type & member )
+    { functor( member ); }
+
+  template< class TagType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
+                 const FunctorType & >::type functor
+             , const typename Policy::member_type & member )
+    { functor( TagType() , member ); }
+
+public:
+
+  ParallelFor( const FunctorType & functor
+             , const Policy      & policy )
+    {
+      const int shared_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
+
+      Kokkos::Serial::scratch_memory_resize( 0 , shared_size );
+
+      for ( int ileague = 0 ; ileague < policy.league_size() ; ++ileague ) {
+        ParallelFor::template driver< typename Policy::work_tag >
+          ( functor , typename Policy::member_type(ileague,policy.league_size(),shared_size) );
+        // functor( typename Policy::member_type(ileague,policy.league_size(),shared_size) );
+      }
+    }
+};
+
+template< class FunctorType , class Arg0 , class Arg1 >
+class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > >
+{
+private:
+
+  typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Serial > Policy ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , typename Policy::work_tag >  ValueInit ;
+
+public:
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+private:
+
+  template< class TagType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
+                 const FunctorType & >::type functor
+             , const typename Policy::member_type  & member
+             ,       reference_type                  update )
+    { functor( member , update ); }
+
+  template< class TagType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
+                 const FunctorType & >::type functor
+             , const typename Policy::member_type  & member
+             ,       reference_type                  update )
+    { functor( TagType() , member , update ); }
+
+public:
+
+  template< class ViewType >
+  ParallelReduce( const FunctorType  & functor
+                , const Policy       & policy
+                , const ViewType     & result
+                )
+    {
+      const int reduce_size = ValueTraits::value_size( functor );
+      const int shared_size = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
+      void * const scratch_reduce = Kokkos::Serial::scratch_memory_resize( reduce_size , shared_size );
+
+      const pointer_type result_ptr =
+        result.ptr_on_device() ? result.ptr_on_device()
+                               : (pointer_type) scratch_reduce ;
+
+      reference_type update = ValueInit::init( functor , result_ptr );
+
+      for ( int ileague = 0 ; ileague < policy.league_size() ; ++ileague ) {
+        ParallelReduce::template driver< typename Policy::work_tag >
+          ( functor , typename Policy::member_type(ileague,policy.league_size(),shared_size) , update );
+      }
+
+      Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( functor , result_ptr );
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+namespace Kokkos {
+
+namespace Impl {
+
+template<typename iType>
+struct TeamThreadRangeBoundariesStruct<iType,SerialTeamMember> {
+  typedef iType index_type;
+  const iType begin ;
+  const iType end ;
+  enum {increment = 1};
+  const SerialTeamMember& thread;
+
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_count)
+    : begin(0)
+    , end(arg_count)
+    , thread(arg_thread)
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_begin, const iType & arg_end )
+    : begin( arg_begin )
+    , end(   arg_end)
+    , thread( arg_thread )
+    {}
+};
+
+  template<typename iType>
+  struct ThreadVectorRangeBoundariesStruct<iType,SerialTeamMember> {
+    typedef iType index_type;
+    enum {start = 0};
+    const iType end;
+    enum {increment = 1};
+
+    KOKKOS_INLINE_FUNCTION
+    ThreadVectorRangeBoundariesStruct (const SerialTeamMember& thread, const iType& count):
+      end( count )
+    {}
+  };
+
+} // namespace Impl
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>
+TeamThreadRange( const Impl::SerialTeamMember& thread, const iType & count )
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>(thread,count);
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>
+TeamThreadRange( const Impl::SerialTeamMember& thread, const iType & begin , const iType & end )
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>(thread,begin,end);
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >
+  ThreadVectorRange(const Impl::SerialTeamMember& thread, const iType& count) {
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >(thread,count);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::SerialTeamMember> PerTeam(const Impl::SerialTeamMember& thread) {
+  return Impl::ThreadSingleStruct<Impl::SerialTeamMember>(thread);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::SerialTeamMember> PerThread(const Impl::SerialTeamMember& thread) {
+  return Impl::VectorSingleStruct<Impl::SerialTeamMember>(thread);
+}
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+   *
+   * The range i=0..N-1 is mapped to all threads of the the calling thread team.
+   * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries, const Lambda& lambda) {
+  for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+}
+
+/** \brief  Inter-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the the calling thread team and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
+                     const Lambda & lambda, ValueType& result) {
+
+  result = ValueType();
+
+  for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    result+=tmp;
+  }
+
+  result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
+}
+
+#ifdef KOKKOS_HAVE_CXX11
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
+                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType result = init_result;
+
+  for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    join(result,tmp);
+  }
+
+  init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
+}
+
+#endif // KOKKOS_HAVE_CXX11
+
+} //namespace Kokkos
+
+namespace Kokkos {
+/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
+ * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
+    loop_boundaries, const Lambda& lambda) {
+  #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+  #pragma ivdep
+  #endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
+      loop_boundaries, const Lambda & lambda, ValueType& result) {
+  result = ValueType();
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    result+=tmp;
+  }
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
+      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType result = init_result;
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    join(result,tmp);
+  }
+  init_result = result;
+}
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
+ *          for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
+ * Depending on the target execution space the operator might be called twice: once with final=false
+ * and once with final=true. When final==true val contains the prefix sum value. The contribution of this
+ * "i" needs to be added to val no matter whether final==true or not. In a serial execution
+ * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
+ * to the final sum value over all vector lanes.
+ * This functionality requires C++11 support.*/
+template< typename iType, class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
+      loop_boundaries, const FunctorType & lambda) {
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
+  typedef typename ValueTraits::value_type value_type ;
+
+  value_type scan_val = value_type();
+
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,scan_val,true);
+  }
+}
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
+  lambda();
+}
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
+  lambda();
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
+  lambda(val);
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
+  lambda(val);
+}
+}
+
+#endif // defined( KOKKOS_HAVE_SERIAL )
+#endif /* #define KOKKOS_SERIAL_HPP */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp b/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..6f6453fd46f1e90cc8ee5f6edd119f45843078d4
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_TaskPolicy.hpp
@@ -0,0 +1,376 @@
+
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_TASKPOLICY_HPP
+#define KOKKOS_TASKPOLICY_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_StaticAssert.hpp>
+#include <impl/Kokkos_AllocationTracker.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+struct FutureValueTypeIsVoidError {};
+
+template < class ExecSpace , class ResultType , class FunctorType >
+class TaskMember ;
+
+template< class ExecPolicy , class ResultType , class FunctorType >
+class TaskForEach ;
+
+template< class ExecPolicy , class ResultType , class FunctorType >
+class TaskReduce ;
+
+template< class ExecPolicy , class ResultType , class FunctorType >
+struct TaskScan ;
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+/**\brief  States of a task */
+enum TaskState
+  { TASK_STATE_NULL         = 0  ///<  Does not exist
+  , TASK_STATE_CONSTRUCTING = 1  ///<  Is under construction
+  , TASK_STATE_WAITING      = 2  ///<  Is waiting for execution
+  , TASK_STATE_EXECUTING    = 4  ///<  Is executing
+  , TASK_STATE_COMPLETE     = 8  ///<  Execution is complete
+  };
+
+/**
+ *
+ *  Future< space >  // value_type == void
+ *  Future< value >  // space == Default
+ *  Future< value , space >
+ *
+ */
+template< class Arg1 = void , class Arg2 = void >
+class Future {
+private:
+
+  template< class , class , class > friend class Impl::TaskMember ;
+  template< class > friend class TaskPolicy ;
+  template< class , class > friend class Future ;
+
+  // Argument #2, if not void, must be the space.
+  enum { Arg1_is_space  = Kokkos::Impl::is_execution_space< Arg1 >::value };
+  enum { Arg2_is_space  = Kokkos::Impl::is_execution_space< Arg2 >::value };
+  enum { Arg2_is_void   = Kokkos::Impl::is_same< Arg2 , void >::value };
+
+  struct ErrorNoExecutionSpace {};
+
+  enum { Opt1  =   Arg1_is_space && Arg2_is_void
+       , Opt2  = ! Arg1_is_space && Arg2_is_void
+       , Opt3  = ! Arg1_is_space && Arg2_is_space
+       , OptOK = Kokkos::Impl::StaticAssert< Opt1 || Opt2 || Opt3 , ErrorNoExecutionSpace >::value
+       };
+
+  typedef typename
+    Kokkos::Impl::if_c< Opt2 || Opt3 , Arg1 , void >::type
+      ValueType ;
+
+  typedef typename
+    Kokkos::Impl::if_c< Opt1 , Arg1 , typename
+    Kokkos::Impl::if_c< Opt2 , Kokkos::DefaultExecutionSpace , typename
+    Kokkos::Impl::if_c< Opt3 , Arg2 , void
+    >::type >::type >::type
+      ExecutionSpace ;
+
+  typedef Impl::TaskMember< ExecutionSpace , void , void >       TaskRoot ;
+  typedef Impl::TaskMember< ExecutionSpace , ValueType , void >  TaskValue ;
+
+  TaskRoot * m_task ;
+
+public:
+
+  typedef ValueType       value_type;
+  typedef ExecutionSpace  execution_space ;
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  TaskState get_task_state() const
+    { return 0 != m_task ? m_task->get_state() : TASK_STATE_NULL ; }
+
+  //----------------------------------------
+
+  explicit
+  Future( TaskRoot * task )
+    : m_task(0)
+    { TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( task ) ); }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  ~Future() { TaskRoot::assign( & m_task , 0 ); }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION
+  Future() : m_task(0) {}
+
+  KOKKOS_INLINE_FUNCTION
+  Future( const Future & rhs )
+    : m_task(0)
+    { TaskRoot::assign( & m_task , rhs.m_task ); }
+
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( const Future & rhs )
+    { TaskRoot::assign( & m_task , rhs.m_task ); return *this ; }
+
+  //----------------------------------------
+
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future( const Future<A1,A2> & rhs )
+    : m_task(0)
+    { TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( rhs.m_task ) ); }
+
+  template< class A1 , class A2 >
+  KOKKOS_INLINE_FUNCTION
+  Future & operator = ( const Future<A1,A2> & rhs )
+    { TaskRoot::assign( & m_task , TaskRoot::template verify_type< value_type >( rhs.m_task ) ); return *this ; }
+
+  //----------------------------------------
+
+  typedef typename TaskValue::get_result_type get_result_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const
+    { return static_cast<TaskValue*>( m_task )->get(); }
+};
+
+namespace Impl {
+
+template< class T >
+struct is_future : public Kokkos::Impl::bool_< false > {};
+
+template< class Arg0 , class Arg1 >
+struct is_future< Kokkos::Experimental::Future<Arg0,Arg1> > : public Kokkos::Impl::bool_< true > {};
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+/** \brief  If the argument is an execution space then a serial task in that space */
+template< class Arg0 = Kokkos::DefaultExecutionSpace >
+class TaskPolicy {
+public:
+
+  typedef typename Arg0::execution_space  execution_space ;
+
+  //----------------------------------------
+  /** \brief  Create a serial task with storage for dependences.
+   *
+   *  Postcondition: Task is in the 'constructing' state.
+   */
+  template< class FunctorType >
+  Future< typename FunctorType::value_type , execution_space >
+  create( const FunctorType & functor
+        , const unsigned      dependence_capacity /* = default */ ) const ;
+
+  /** \brief  Create a foreach task with storage for dependences. */
+  template< class ExecPolicy , class FunctorType >
+  Future< typename FunctorType::value_type , execution_space >
+  create_foreach( const ExecPolicy  & policy
+                , const FunctorType & functor
+                , const unsigned      dependence_capacity /* = default */ ) const ;
+
+  /** \brief  Create a reduce task with storage for dependences. */
+  template< class ExecPolicy , class FunctorType >
+  Future< typename FunctorType::value_type , execution_space >
+  create_reduce( const ExecPolicy  & policy
+               , const FunctorType & functor
+               , const unsigned      dependence_capacity /* = default */ ) const ;
+
+  /** \brief  Create a scan task with storage for dependences. */
+  template< class ExecPolicy , class FunctorType >
+  Future< typename FunctorType::value_type , execution_space >
+  create_scan( const ExecPolicy  & policy
+             , const FunctorType & functor
+             , const unsigned      dependence_capacity /* = default */ ) const ;
+
+  /** \brief  Set dependence that 'after' cannot start execution
+   *          until 'before' has completed.
+   *
+   *  Precondition: The 'after' task must be in then 'Constructing' state.
+   */
+  template< class TA , class TB >
+  void set_dependence( const Future<TA,execution_space> & after
+                     , const Future<TB,execution_space> & before ) const ;
+
+  /** \brief  Spawn a task in the 'Constructing' state
+   *
+   *  Precondition:  Task is in the 'constructing' state.
+   *  Postcondition: Task is waiting, executing, or complete.
+   */
+  template< class T >
+  const Future<T,execution_space> &
+  spawn( const Future<T,execution_space> & ) const ;
+
+  //----------------------------------------
+  /** \brief  Query dependence of an executing task */
+
+  template< class FunctorType >
+  Future< execution_space >
+  get_dependence( FunctorType * , const int ) const ;
+
+  //----------------------------------------
+  /** \brief  Clear current dependences of an executing task
+   *          in preparation for setting new dependences and
+   *          respawning.
+   *
+   * Precondition: The functor must be a task in the executing state.
+   */
+  template< class FunctorType >
+  void clear_dependence( FunctorType * ) const ;
+
+  /** \brief  Set dependence that 'after' cannot start execution
+   *          until 'before' has completed.
+   *
+   *  The 'after' functor must be in the executing state
+   */
+  template< class FunctorType , class TB >
+  void set_dependence( FunctorType * after
+                     , const Future<TB,execution_space> & before ) const ;
+
+  /** \brief  Respawn (reschedule) an executing task to be called again
+   *          after all dependences have completed.
+   */
+  template< class FunctorType >
+  void respawn( FunctorType * ) const ;
+};
+
+//----------------------------------------------------------------------------
+/** \brief  Create and spawn a single-thread task */
+template< class ExecSpace , class FunctorType >
+inline
+Future< typename FunctorType::value_type , ExecSpace >
+spawn( TaskPolicy<ExecSpace> & policy , const FunctorType & functor )
+{ return policy.spawn( policy.create( functor ) ); }
+
+/** \brief  Create and spawn a single-thread task with dependences */
+template< class ExecSpace , class FunctorType , class Arg0 , class Arg1 >
+inline
+Future< typename FunctorType::value_type , ExecSpace >
+spawn( TaskPolicy<ExecSpace>   & policy
+     , const FunctorType       & functor
+     , const Future<Arg0,Arg1> & before_0
+     , const Future<Arg0,Arg1> & before_1 )
+{
+  Future< typename FunctorType::value_type , ExecSpace > f ;
+  f = policy.create( functor , 2 );
+  policy.add_dependence( f , before_0 );
+  policy.add_dependence( f , before_1 );
+  policy.spawn( f );
+  return f ;
+}
+
+//----------------------------------------------------------------------------
+/** \brief  Create and spawn a parallel_for task */
+template< class ExecSpace , class ParallelPolicyType , class FunctorType >
+inline
+Future< typename FunctorType::value_type , ExecSpace >
+spawn_foreach( TaskPolicy<ExecSpace>     & task_policy
+             , const ParallelPolicyType  & parallel_policy
+             , const FunctorType         & functor )
+{ return task_policy.spawn( task_policy.create_foreach( parallel_policy , functor ) ); }
+
+/** \brief  Create and spawn a parallel_reduce task */
+template< class ExecSpace , class ParallelPolicyType , class FunctorType >
+inline
+Future< typename FunctorType::value_type , ExecSpace >
+spawn_reduce( TaskPolicy<ExecSpace>     & task_policy
+            , const ParallelPolicyType  & parallel_policy
+            , const FunctorType         & functor )
+{ return task_policy.spawn( task_policy.create_reduce( parallel_policy , functor ) ); }
+
+//----------------------------------------------------------------------------
+/** \brief  Respawn a task functor with dependences */
+template< class ExecSpace , class FunctorType , class Arg0 , class Arg1 >
+inline
+void respawn( TaskPolicy<ExecSpace>   & policy
+            , FunctorType *             functor
+            , const Future<Arg0,Arg1> & before_0
+            , const Future<Arg0,Arg1> & before_1
+            )
+{
+  policy.clear_dependence( functor );
+  policy.add_dependence( functor , before_0 );
+  policy.add_dependence( functor , before_1 );
+  policy.respawn( functor );
+}
+
+//----------------------------------------------------------------------------
+
+template< class ExecSpace >
+void wait( TaskPolicy< ExecSpace > & );
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_TASKPOLICY_HPP */
+
diff --git a/lib/kokkos/core/src/Kokkos_Threads.hpp b/lib/kokkos/core/src/Kokkos_Threads.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..4661b714b235d3426b63dc5dcba7e77d514c2258
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Threads.hpp
@@ -0,0 +1,217 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADS_HPP
+#define KOKKOS_THREADS_HPP
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+
+#include <cstddef>
+#include <iosfwd>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_ScratchSpace.hpp>
+#include <Kokkos_Layout.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+class ThreadsExec ;
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/** \brief  Execution space for a pool of Pthreads or C11 threads on a CPU. */
+class Threads {
+public:
+  //! \name Type declarations that all Kokkos devices must provide.
+  //@{
+  //! Tag this class as a kokkos execution space
+  typedef Threads                  execution_space ;
+  typedef Kokkos::HostSpace        memory_space ;
+
+  //! This execution space preferred device_type
+  typedef Kokkos::Device<execution_space,memory_space> device_type;
+
+  typedef Kokkos::LayoutRight      array_layout ;
+  typedef memory_space::size_type  size_type ;
+
+  typedef ScratchMemorySpace< Threads >  scratch_memory_space ;
+
+
+  //@}
+  /*------------------------------------------------------------------------*/
+  //! \name Static functions that all Kokkos devices must implement.
+  //@{
+
+  /// \brief True if and only if this method is being called in a
+  ///   thread-parallel function.
+  static int in_parallel();
+
+  /** \brief  Set the device in a "sleep" state.
+   *
+   * This function sets the device in a "sleep" state in which it is
+   * not ready for work.  This may consume less resources than if the
+   * device were in an "awake" state, but it may also take time to
+   * bring the device from a sleep state to be ready for work.
+   *
+   * \return True if the device is in the "sleep" state, else false if
+   *   the device is actively working and could not enter the "sleep"
+   *   state.
+   */
+  static bool sleep();
+
+  /// \brief Wake the device from the 'sleep' state so it is ready for work.
+  ///
+  /// \return True if the device is in the "ready" state, else "false"
+  ///  if the device is actively working (which also means that it's
+  ///  awake).
+  static bool wake();
+
+  /// \brief Wait until all dispatched functors complete.
+  ///
+  /// The parallel_for or parallel_reduce dispatch of a functor may
+  /// return asynchronously, before the functor completes.  This
+  /// method does not return until all dispatched functors on this
+  /// device have completed.
+  static void fence();
+
+  /// \brief Free any resources being consumed by the device.
+  ///
+  /// For the Threads device, this terminates spawned worker threads.
+  static void finalize();
+
+  /// \brief Print configuration information to the given output stream.
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  //@}
+  /*------------------------------------------------------------------------*/
+  /*------------------------------------------------------------------------*/
+  //! \name Space-specific functions
+  //@{
+
+  /** \brief Initialize the device in the "ready to work" state.
+   *
+   *  The device is initialized in a "ready to work" or "awake" state.
+   *  This state reduces latency and thus improves performance when
+   *  dispatching work.  However, the "awake" state consumes resources
+   *  even when no work is being done.  You may call sleep() to put
+   *  the device in a "sleeping" state that does not consume as many
+   *  resources, but it will take time (latency) to awaken the device
+   *  again (via the wake()) method so that it is ready for work.
+   *
+   *  Teams of threads are distributed as evenly as possible across
+   *  the requested number of numa regions and cores per numa region.
+   *  A team will not be split across a numa region.
+   *
+   *  If the 'use_' arguments are not supplied the hwloc is queried
+   *  to use all available cores.
+   */
+  static void initialize( unsigned threads_count = 0 ,
+                          unsigned use_numa_count = 0 ,
+                          unsigned use_cores_per_numa = 0 ,
+                          bool allow_asynchronous_threadpool = false );
+
+  static int is_initialized();
+
+  static Threads & instance( int = 0 );
+
+  //----------------------------------------
+
+  static int thread_pool_size( int depth = 0 );
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  static int thread_pool_rank();
+#else
+  KOKKOS_INLINE_FUNCTION static int thread_pool_rank() { return 0 ; }
+#endif
+
+  inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
+  KOKKOS_INLINE_FUNCTION static unsigned hardware_thread_id() { return thread_pool_rank(); }
+
+  //@}
+  //----------------------------------------
+};
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+struct VerifyExecutionCanAccessMemorySpace
+  < Kokkos::Threads::memory_space
+  , Kokkos::Threads::scratch_memory_space
+  >
+{
+  enum { value = true };
+  inline static void verify( void ) { }
+  inline static void verify( const void * ) { }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+#include <Kokkos_ExecPolicy.hpp>
+#include <Kokkos_Parallel.hpp>
+#include <Threads/Kokkos_ThreadsExec.hpp>
+#include <Threads/Kokkos_ThreadsTeam.hpp>
+#include <Threads/Kokkos_Threads_Parallel.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
+#endif /* #define KOKKOS_THREADS_HPP */
+
+
diff --git a/lib/kokkos/core/src/Kokkos_Vectorization.hpp b/lib/kokkos/core/src/Kokkos_Vectorization.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..a60c0ecaa7b83bd49fb187bf37ca5a84d6360744
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_Vectorization.hpp
@@ -0,0 +1,53 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/// \file Kokkos_Vectorization.hpp
+/// \brief Declaration and definition of Kokkos::Vectorization interface.
+#ifndef KOKKOS_VECTORIZATION_HPP
+#define KOKKOS_VECTORIZATION_HPP
+
+#if defined( KOKKOS_HAVE_CUDA )
+#include <Cuda/Kokkos_Cuda_Vectorization.hpp>
+#endif
+
+#endif
diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..cd6c8af9fedffb849e0cb8de8a5160e8557d1ffe
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_View.hpp
@@ -0,0 +1,1915 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VIEW_HPP
+#define KOKKOS_VIEW_HPP
+
+#include <string>
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_HostSpace.hpp>
+#include <Kokkos_MemoryTraits.hpp>
+
+#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
+
+#include <impl/Kokkos_StaticAssert.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Shape.hpp>
+#include <impl/Kokkos_AnalyzeShape.hpp>
+#include <impl/Kokkos_ViewOffset.hpp>
+#include <impl/Kokkos_ViewSupport.hpp>
+#include <impl/Kokkos_Tags.hpp>
+#include <type_traits>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  View specialization mapping of view traits to a specialization tag */
+template< class ValueType ,
+          class ArraySpecialize ,
+          class ArrayLayout ,
+          class MemorySpace ,
+          class MemoryTraits >
+struct ViewSpecialize ;
+
+/** \brief  Defines the type of a subview given a source view type
+ *          and subview argument types.
+ */
+template< class SrcViewType
+        , class Arg0Type
+        , class Arg1Type
+        , class Arg2Type
+        , class Arg3Type
+        , class Arg4Type
+        , class Arg5Type
+        , class Arg6Type
+        , class Arg7Type
+        >
+struct ViewSubview /* { typedef ... type ; } */ ;
+
+template< class DstViewSpecialize ,
+          class SrcViewSpecialize = void ,
+          class Enable = void >
+struct ViewAssignment ;
+
+template< class DstMemorySpace , class SrcMemorySpace >
+struct DeepCopy ;
+
+} /* namespace Impl */
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \class ViewTraits
+ *  \brief Traits class for accessing attributes of a View.
+ *
+ * This is an implementation detail of View.  It is only of interest
+ * to developers implementing a new specialization of View.
+ *
+ * Template argument permutations:
+ *   - View< DataType , void         , void         , void >
+ *   - View< DataType , Space        , void         , void >
+ *   - View< DataType , Space        , MemoryTraits , void >
+ *   - View< DataType , Space        , void         , MemoryTraits >
+ *   - View< DataType , ArrayLayout  , void         , void >
+ *   - View< DataType , ArrayLayout  , Space        , void >
+ *   - View< DataType , ArrayLayout  , MemoryTraits , void   >
+ *   - View< DataType , ArrayLayout  , Space        , MemoryTraits >
+ *   - View< DataType , MemoryTraits , void         , void  >
+ */
+
+template< class DataType ,
+          class Arg1 = void ,
+          class Arg2 = void ,
+          class Arg3 = void >
+class ViewTraits {
+private:
+
+  // Layout, Space, and MemoryTraits are optional
+  // but need to appear in that order. That means Layout
+  // can only be Arg1, Space can be Arg1 or Arg2, and
+  // MemoryTraits can be Arg1, Arg2 or Arg3
+
+  enum { Arg1IsLayout = Impl::is_array_layout<Arg1>::value };
+
+  enum { Arg1IsSpace = Impl::is_space<Arg1>::value };
+  enum { Arg2IsSpace = Impl::is_space<Arg2>::value };
+
+  enum { Arg1IsMemoryTraits = Impl::is_memory_traits<Arg1>::value };
+  enum { Arg2IsMemoryTraits = Impl::is_memory_traits<Arg2>::value };
+  enum { Arg3IsMemoryTraits = Impl::is_memory_traits<Arg3>::value };
+
+  enum { Arg1IsVoid = Impl::is_same< Arg1 , void >::value };
+  enum { Arg2IsVoid = Impl::is_same< Arg2 , void >::value };
+  enum { Arg3IsVoid = Impl::is_same< Arg3 , void >::value };
+
+  // Arg1 is Layout, Space, MemoryTraits, or void
+  typedef typename
+    Impl::StaticAssert<
+      ( 1 == Arg1IsLayout + Arg1IsSpace + Arg1IsMemoryTraits + Arg1IsVoid )
+      , Arg1 >::type Arg1Verified ;
+
+  // If Arg1 is Layout       then Arg2 is Space, MemoryTraits, or void
+  // If Arg1 is Space        then Arg2 is MemoryTraits or void
+  // If Arg1 is MemoryTraits then Arg2 is void
+  // If Arg1 is Void         then Arg2 is void
+  typedef typename
+    Impl::StaticAssert<
+      ( Arg1IsLayout       && ( 1 == Arg2IsSpace + Arg2IsMemoryTraits + Arg2IsVoid ) ) ||
+      ( Arg1IsSpace        && ( 0 == Arg2IsSpace ) && ( 1 == Arg2IsMemoryTraits + Arg2IsVoid ) ) ||
+      ( Arg1IsMemoryTraits && Arg2IsVoid ) ||
+      ( Arg1IsVoid         && Arg2IsVoid )
+      , Arg2 >::type Arg2Verified ;
+
+  // Arg3 is MemoryTraits or void and at most one argument is MemoryTraits
+  typedef typename
+    Impl::StaticAssert<
+      ( 1 == Arg3IsMemoryTraits + Arg3IsVoid ) &&
+      ( Arg1IsMemoryTraits + Arg2IsMemoryTraits + Arg3IsMemoryTraits <= 1 )
+      , Arg3 >::type Arg3Verified ;
+
+  // Arg1 or Arg2 may have execution and memory spaces
+  typedef typename Impl::if_c<( Arg1IsSpace ), Arg1Verified ,
+          typename Impl::if_c<( Arg2IsSpace ), Arg2Verified ,
+          Kokkos::DefaultExecutionSpace
+          >::type >::type::execution_space  ExecutionSpace ;
+
+  typedef typename Impl::if_c<( Arg1IsSpace ), Arg1Verified ,
+          typename Impl::if_c<( Arg2IsSpace ), Arg2Verified ,
+          Kokkos::DefaultExecutionSpace
+          >::type >::type::memory_space  MemorySpace ;
+
+  typedef typename Impl::is_space<
+    typename Impl::if_c<( Arg1IsSpace ), Arg1Verified ,
+    typename Impl::if_c<( Arg2IsSpace ), Arg2Verified ,
+    Kokkos::DefaultExecutionSpace
+    >::type >::type >::host_mirror_space  HostMirrorSpace ;
+
+  // Arg1 may be array layout
+  typedef typename Impl::if_c< Arg1IsLayout , Arg1Verified ,
+          typename ExecutionSpace::array_layout
+          >::type ArrayLayout ;
+
+  // Arg1, Arg2, or Arg3 may be memory traits
+  typedef typename Impl::if_c< Arg1IsMemoryTraits , Arg1Verified ,
+          typename Impl::if_c< Arg2IsMemoryTraits , Arg2Verified ,
+          typename Impl::if_c< Arg3IsMemoryTraits , Arg3Verified ,
+          MemoryManaged
+          >::type >::type >::type  MemoryTraits ;
+
+  typedef Impl::AnalyzeShape<DataType> analysis ;
+
+public:
+
+  //------------------------------------
+  // Data type traits:
+
+  typedef DataType                            data_type ;
+  typedef typename analysis::const_type       const_data_type ;
+  typedef typename analysis::non_const_type   non_const_data_type ;
+
+  //------------------------------------
+  // Array of intrinsic scalar type traits:
+
+  typedef typename analysis::array_intrinsic_type            array_intrinsic_type ;
+  typedef typename analysis::const_array_intrinsic_type      const_array_intrinsic_type ;
+  typedef typename analysis::non_const_array_intrinsic_type  non_const_array_intrinsic_type ;
+
+  //------------------------------------
+  // Value type traits:
+
+  typedef typename analysis::value_type            value_type ;
+  typedef typename analysis::const_value_type      const_value_type ;
+  typedef typename analysis::non_const_value_type  non_const_value_type ;
+
+  //------------------------------------
+  // Layout and shape traits:
+
+  typedef ArrayLayout                array_layout ;
+  typedef typename analysis::shape   shape_type ;
+
+  enum { rank         = shape_type::rank };
+  enum { rank_dynamic = shape_type::rank_dynamic };
+
+  //------------------------------------
+  // Execution space, memory space, memory access traits, and host mirror space.
+
+  typedef ExecutionSpace   execution_space ;
+  typedef MemorySpace      memory_space ;
+  typedef Device<ExecutionSpace,MemorySpace>  device_type ;
+  typedef MemoryTraits     memory_traits ;
+  typedef HostMirrorSpace  host_mirror_space ;
+
+  typedef typename memory_space::size_type  size_type ;
+
+  enum { is_hostspace      = Impl::is_same< memory_space , HostSpace >::value };
+  enum { is_managed        = memory_traits::Unmanaged == 0 };
+  enum { is_random_access  = memory_traits::RandomAccess == 1 };
+
+  //------------------------------------
+
+
+  //------------------------------------
+  // Specialization tag:
+
+  typedef typename
+    Impl::ViewSpecialize< value_type
+                        , typename analysis::specialize
+                        , array_layout
+                        , memory_space
+                        , memory_traits
+                        >::type specialize ;
+};
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+class ViewDefault {};
+
+/** \brief  Default view specialization has LayoutLeft, LayoutRight, or LayoutStride.
+ */
+template< class ValueType , class MemorySpace , class MemoryTraits >
+struct ViewSpecialize< ValueType , void , LayoutLeft , MemorySpace , MemoryTraits >
+{ typedef ViewDefault type ; };
+
+template< class ValueType , class MemorySpace , class MemoryTraits >
+struct ViewSpecialize< ValueType , void , LayoutRight , MemorySpace , MemoryTraits >
+{ typedef ViewDefault type ; };
+
+template< class ValueType , class MemorySpace , class MemoryTraits >
+struct ViewSpecialize< ValueType , void , LayoutStride , MemorySpace , MemoryTraits >
+{ typedef ViewDefault type ; };
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief Types for compile-time detection of View usage errors */
+namespace ViewError {
+
+struct allocation_constructor_requires_managed {};
+struct allocation_constructor_requires_nonconst {};
+struct user_pointer_constructor_requires_unmanaged {};
+struct device_shmem_constructor_requires_unmanaged {};
+
+struct scalar_operator_called_from_non_scalar_view {};
+
+} /* namespace ViewError */
+
+//----------------------------------------------------------------------------
+/** \brief  Enable view parentheses operator for
+ *          match of layout and integral arguments.
+ *          If correct rank define type from traits,
+ *          otherwise define type as an error message.
+ */
+template< class ReturnType , class Traits , class Layout , unsigned Rank ,
+          typename iType0 = int , typename iType1 = int ,
+          typename iType2 = int , typename iType3 = int ,
+          typename iType4 = int , typename iType5 = int ,
+          typename iType6 = int , typename iType7 = int ,
+          class Enable = void >
+struct ViewEnableArrayOper ;
+
+template< class ReturnType , class Traits , class Layout , unsigned Rank ,
+          typename iType0 , typename iType1 ,
+          typename iType2 , typename iType3 ,
+          typename iType4 , typename iType5 ,
+          typename iType6 , typename iType7 >
+struct ViewEnableArrayOper<
+   ReturnType , Traits , Layout , Rank ,
+   iType0 , iType1 , iType2 , iType3 ,
+   iType4 , iType5 , iType6 , iType7 ,
+   typename enable_if<
+     iType0(0) == 0 && iType1(0) == 0 && iType2(0) == 0 && iType3(0) == 0 &&
+     iType4(0) == 0 && iType5(0) == 0 && iType6(0) == 0 && iType7(0) == 0 &&
+     is_same< typename Traits::array_layout , Layout >::value &&
+     ( unsigned(Traits::rank) == Rank )
+   >::type >
+{
+  typedef ReturnType type ;
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \class View
+ *  \brief View to an array of data.
+ *
+ * A View represents an array of one or more dimensions.
+ * For details, please refer to Kokkos' tutorial materials.
+ *
+ * \section Kokkos_View_TemplateParameters Template parameters
+ *
+ * This class has both required and optional template parameters.  The
+ * \c DataType parameter must always be provided, and must always be
+ * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are
+ * placeholders for different template parameters.  The default value
+ * of the fifth template parameter \c Specialize suffices for most use
+ * cases.  When explaining the template parameters, we won't refer to
+ * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer
+ * to the valid categories of template parameters, in whatever order
+ * they may occur.
+ *
+ * Valid ways in which template arguments may be specified:
+ *   - View< DataType , Space >
+ *   - View< DataType , Space  ,         MemoryTraits >
+ *   - View< DataType , Space  , void  , MemoryTraits >
+ *   - View< DataType , Layout , Space >
+ *   - View< DataType , Layout , Space , MemoryTraits >
+ *
+ * \tparam DataType (required) This indicates both the type of each
+ *   entry of the array, and the combination of compile-time and
+ *   run-time array dimension(s).  For example, <tt>double*</tt>
+ *   indicates a one-dimensional array of \c double with run-time
+ *   dimension, and <tt>int*[3]</tt> a two-dimensional array of \c int
+ *   with run-time first dimension and compile-time second dimension
+ *   (of 3).  In general, the run-time dimensions (if any) must go
+ *   first, followed by zero or more compile-time dimensions.  For
+ *   more examples, please refer to the tutorial materials.
+ *
+ * \tparam Space (required) The memory space.
+ *
+ * \tparam Layout (optional) The array's layout in memory.  For
+ *   example, LayoutLeft indicates a column-major (Fortran style)
+ *   layout, and LayoutRight a row-major (C style) layout.  If not
+ *   specified, this defaults to the preferred layout for the
+ *   <tt>Space</tt>.
+ *
+ * \tparam MemoryTraits (optional) Assertion of the user's intended
+ *   access behavior.  For example, RandomAccess indicates read-only
+ *   access with limited spatial locality, and Unmanaged lets users
+ *   wrap externally allocated memory in a View without automatic
+ *   deallocation.
+ *
+ * \section Kokkos_View_MT MemoryTraits discussion
+ *
+ * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on Space
+ *
+ * Some \c MemoryTraits options may have different interpretations for
+ * different \c Space types.  For example, with the Cuda device,
+ * \c RandomAccess tells Kokkos to fetch the data through the texture
+ * cache, whereas the non-GPU devices have no such hardware construct.
+ *
+ * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits
+ *
+ * Users should defer applying the optional \c MemoryTraits parameter
+ * until the point at which they actually plan to rely on it in a
+ * computational kernel.  This minimizes the number of template
+ * parameters exposed in their code, which reduces the cost of
+ * compilation.  Users may always assign a View without specified
+ * \c MemoryTraits to a compatible View with that specification.
+ * For example:
+ * \code
+ * // Pass in the simplest types of View possible.
+ * void
+ * doSomething (View<double*, Cuda> out,
+ *              View<const double*, Cuda> in)
+ * {
+ *   // Assign the "generic" View in to a RandomAccess View in_rr.
+ *   // Note that RandomAccess View objects must have const data.
+ *   View<const double*, Cuda, RandomAccess> in_rr = in;
+ *   // ... do something with in_rr and out ...
+ * }
+ * \endcode
+ */
+template< class DataType ,
+          class Arg1Type = void , /* ArrayLayout, SpaceType, or MemoryTraits */
+          class Arg2Type = void , /* SpaceType or MemoryTraits */
+          class Arg3Type = void , /* MemoryTraits */
+          class Specialize =
+            typename ViewTraits<DataType,Arg1Type,Arg2Type,Arg3Type>::specialize >
+class View ;
+
+namespace Impl {
+
+template< class C >
+struct is_view : public bool_< false > {};
+
+template< class D , class A1 , class A2 , class A3 , class S >
+struct is_view< View< D , A1 , A2 , A3 , S > > : public bool_< true > {};
+
+}
+
+//----------------------------------------------------------------------------
+
+template< class DataType ,
+          class Arg1Type ,
+          class Arg2Type ,
+          class Arg3Type >
+class View< DataType , Arg1Type , Arg2Type , Arg3Type , Impl::ViewDefault >
+  : public ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type >
+{
+public:
+
+  typedef ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ;
+
+private:
+
+  // Assignment of compatible views requirement:
+  template< class , class , class , class , class > friend class View ;
+
+  // Assignment of compatible subview requirement:
+  template< class , class , class > friend struct Impl::ViewAssignment ;
+
+  // Dimensions, cardinality, capacity, and offset computation for
+  // multidimensional array view of contiguous memory.
+  // Inherits from Impl::Shape
+  typedef Impl::ViewOffset< typename traits::shape_type
+                          , typename traits::array_layout
+                          > offset_map_type ;
+
+  // Intermediary class for data management and access
+  typedef Impl::ViewDataManagement< traits > view_data_management ;
+
+  //----------------------------------------
+  // Data members:
+
+  typename view_data_management::handle_type  m_ptr_on_device ;
+  offset_map_type                             m_offset_map ;
+  view_data_management                        m_management ;
+  Impl::AllocationTracker                     m_tracker ;
+
+  //----------------------------------------
+
+public:
+
+  /** return type for all indexing operators */
+  typedef typename view_data_management::return_type reference_type ;
+
+  enum { reference_type_is_lvalue = view_data_management::ReturnTypeIsReference };
+
+  typedef View< typename traits::array_intrinsic_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                typename traits::memory_traits > array_type ;
+
+  typedef View< typename traits::const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                typename traits::memory_traits > const_type ;
+
+  typedef View< typename traits::non_const_data_type ,
+                typename traits::array_layout ,
+                typename traits::device_type ,
+                typename traits::memory_traits > non_const_type ;
+
+  typedef View< typename traits::non_const_data_type ,
+                typename traits::array_layout ,
+                typename traits::host_mirror_space ,
+                void > HostMirror ;
+
+  //------------------------------------
+  // Shape
+
+  enum { Rank = traits::rank };
+
+  KOKKOS_INLINE_FUNCTION offset_map_type shape() const { return m_offset_map ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const { return m_offset_map.N0 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_offset_map.N1 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_offset_map.N2 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_offset_map.N3 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_offset_map.N4 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_offset_map.N5 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_offset_map.N6 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_offset_map.N7 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type size() const { return m_offset_map.cardinality(); }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  typename traits::size_type dimension( const iType & i ) const
+    { return Impl::dimension( m_offset_map , i ); }
+
+  //------------------------------------
+  // Destructor, constructors, assignment operators:
+
+  KOKKOS_INLINE_FUNCTION
+  ~View() {}
+
+  KOKKOS_INLINE_FUNCTION
+  View()
+    : m_ptr_on_device()
+    , m_offset_map()
+    , m_management()
+    , m_tracker()
+    { m_offset_map.assign(0, 0,0,0,0,0,0,0,0); }
+
+  KOKKOS_INLINE_FUNCTION
+  View( const View & rhs )
+    : m_ptr_on_device()
+    , m_offset_map()
+    , m_management()
+    , m_tracker()
+    {
+      (void) Impl::ViewAssignment<
+         typename traits::specialize ,
+         typename traits::specialize >( *this , rhs );
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( const View & rhs )
+    {
+      (void) Impl::ViewAssignment<
+         typename traits::specialize ,
+         typename traits::specialize >( *this , rhs );
+      return *this ;
+    }
+
+  //------------------------------------
+  // Construct or assign compatible view:
+
+  template< class RT , class RL , class RD , class RM , class RS >
+  KOKKOS_INLINE_FUNCTION
+  View( const View<RT,RL,RD,RM,RS> & rhs )
+    : m_ptr_on_device()
+    , m_offset_map()
+    , m_management()
+    , m_tracker()
+    {
+      (void) Impl::ViewAssignment<
+         typename traits::specialize , RS >( *this , rhs );
+    }
+
+  template< class RT , class RL , class RD , class RM , class RS >
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( const View<RT,RL,RD,RM,RS> & rhs )
+    {
+      (void) Impl::ViewAssignment<
+         typename traits::specialize , RS >( *this , rhs );
+      return *this ;
+    }
+
+  //------------------------------------
+  /**\brief Allocation of a managed view with possible alignment padding.
+   *
+   *  Allocation properties for allocating and initializing to the default value_type:
+   *    Kokkos::ViewAllocate()
+   *    Kokkos::ViewAllocate("label")  OR  "label"
+   *    Kokkos::ViewAllocate(std::string("label"))  OR  std::string("label")
+   *
+   *  Allocation properties for allocating and bypassing initialization:
+   *    Kokkos::ViewAllocateWithoutInitializing()
+   *    Kokkos::ViewAllocateWithoutInitializing("label")
+   */
+
+  template< class AllocationProperties >
+  explicit inline
+  View( const AllocationProperties & prop ,
+        // Impl::ViewAllocProp::size_type exists when the traits and allocation properties
+        // are valid for allocating viewed memory.
+        const typename Impl::ViewAllocProp< traits , AllocationProperties >::size_type n0 = 0 ,
+        const size_t n1 = 0 ,
+        const size_t n2 = 0 ,
+        const size_t n3 = 0 ,
+        const size_t n4 = 0 ,
+        const size_t n5 = 0 ,
+        const size_t n6 = 0 ,
+        const size_t n7 = 0 ,
+        const size_t n8 = 0 )
+    : m_ptr_on_device()
+    , m_offset_map()
+    , m_management()
+    , m_tracker()
+    {
+      typedef Impl::ViewAllocProp< traits , AllocationProperties > Alloc ;
+
+      static_assert(!std::is_same<typename traits::array_layout, LayoutStride>::value,
+                         "LayoutStride does not support View constructor which takes dimensions directly!");
+
+      m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n8 );
+      if(Alloc::AllowPadding)
+        m_offset_map.set_padding();
+
+      m_ptr_on_device = view_data_management::template allocate< Alloc::Initialize >( Alloc::label(prop) , m_offset_map, m_tracker );
+
+    }
+
+  template< class AllocationProperties >
+  explicit inline
+  View( const AllocationProperties & prop ,
+        const typename traits::array_layout & layout ,
+        // Impl::ViewAllocProp::size_type exists when the traits and allocation properties
+        // are valid for allocating viewed memory.
+        const typename Impl::ViewAllocProp< traits , AllocationProperties >::size_type = 0 )
+    : m_ptr_on_device()
+    , m_offset_map()
+    , m_management()
+    , m_tracker()
+    {
+      typedef Impl::ViewAllocProp< traits , AllocationProperties > Alloc ;
+
+      m_offset_map.assign( layout );
+      if(Alloc::AllowPadding)
+        m_offset_map.set_padding();
+
+      m_ptr_on_device = view_data_management::template allocate< Alloc::Initialize >( Alloc::label(prop) , m_offset_map, m_tracker );
+
+      m_management.set_noncontiguous();
+    }
+
+  //------------------------------------
+  // Assign an unmanaged View from pointer, can be called in functors.
+  // No alignment padding is performed.
+
+  template< class Type >
+  explicit KOKKOS_INLINE_FUNCTION
+  View( Type * ptr ,
+        typename Impl::ViewRawPointerProp< traits , Type >::size_type n0 = 0 ,
+        const size_t n1 = 0 ,
+        const size_t n2 = 0 ,
+        const size_t n3 = 0 ,
+        const size_t n4 = 0 ,
+        const size_t n5 = 0 ,
+        const size_t n6 = 0 ,
+        const size_t n7 = 0 ,
+        const size_t n8 = 0 )
+    : m_ptr_on_device(ptr)
+    , m_offset_map()
+    , m_management()
+    , m_tracker()
+    {
+      m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n8 );
+      m_management.set_unmanaged();
+    }
+
+  template< class Type >
+  explicit KOKKOS_INLINE_FUNCTION
+  View( Type * ptr ,
+        typename traits::array_layout const & layout ,
+        typename Impl::ViewRawPointerProp< traits , Type >::size_type = 0 )
+    : m_ptr_on_device(ptr)
+    , m_offset_map()
+    , m_management()
+    , m_tracker()
+    {
+      m_offset_map.assign( layout );
+      m_management.set_unmanaged();
+      m_management.set_noncontiguous();
+    }
+
+
+
+  //------------------------------------
+  // Assign a View from an AllocationTracker,
+  // The allocator used must be compatiable with the memory space of the view
+  // No alignment padding is performed.
+  // TODO: Should these allow padding??? DJS 01/15/15
+  explicit
+  View( Impl::AllocationTracker const &arg_tracker ,
+        const size_t n0 = 0 ,
+        const size_t n1 = 0 ,
+        const size_t n2 = 0 ,
+        const size_t n3 = 0 ,
+        const size_t n4 = 0 ,
+        const size_t n5 = 0 ,
+        const size_t n6 = 0 ,
+        const size_t n7 = 0 ,
+        const size_t n8 = 0 )
+    : m_ptr_on_device(reinterpret_cast<typename traits::value_type*>(arg_tracker.alloc_ptr()))
+    , m_offset_map()
+    , m_management()
+    , m_tracker(arg_tracker)
+    {
+      m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7, n8 );
+
+      const size_t req_size = m_offset_map.capacity() * sizeof(typename traits::value_type);
+      if ( m_tracker.alloc_size() < req_size ) {
+        Impl::throw_runtime_exception("Error: tracker.alloc_size() < req_size");
+      }
+    }
+
+  explicit
+  View( Impl::AllocationTracker const & arg_tracker
+      , typename traits::array_layout const & layout )
+    : m_ptr_on_device(reinterpret_cast<typename traits::value_type*>(arg_tracker.alloc_ptr()))
+    , m_offset_map()
+    , m_management()
+    , m_tracker(arg_tracker)
+    {
+      m_offset_map.assign( layout );
+
+      const size_t req_size = m_offset_map.capacity() * sizeof(typename traits::value_type);
+      if ( m_tracker.alloc_size() < req_size ) {
+        Impl::throw_runtime_exception("Error: tracker.alloc_size() < req_size");
+      }
+
+      m_management.set_noncontiguous();
+    }
+
+  //------------------------------------
+  /** \brief  Constructors for subviews requires following
+   *          type-compatibility condition, enforce via StaticAssert.
+   *
+   *  Impl::is_same< View ,
+   *                 typename Impl::ViewSubview< View<D,A1,A2,A3,Impl::ViewDefault>
+   *                                           , ArgType0 , ArgType1 , ArgType2 , ArgType3
+   *                                           , ArgType4 , ArgType5 , ArgType6 , ArgType7
+   *                 >::type >::value
+   */
+  template< class D , class A1 , class A2 , class A3
+          , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+          , class SubArg4_type , class SubArg5_type , class SubArg6_type , class SubArg7_type
+          >
+  KOKKOS_INLINE_FUNCTION
+  View( const View<D,A1,A2,A3,Impl::ViewDefault> & src
+      , const SubArg0_type & arg0 , const SubArg1_type & arg1
+      , const SubArg2_type & arg2 , const SubArg3_type & arg3
+      , const SubArg4_type & arg4 , const SubArg5_type & arg5
+      , const SubArg6_type & arg6 , const SubArg7_type & arg7
+      );
+
+  template< class D , class A1 , class A2 , class A3
+          , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+          , class SubArg4_type , class SubArg5_type , class SubArg6_type
+          >
+  KOKKOS_INLINE_FUNCTION
+  View( const View<D,A1,A2,A3,Impl::ViewDefault> & src
+      , const SubArg0_type & arg0 , const SubArg1_type & arg1
+      , const SubArg2_type & arg2 , const SubArg3_type & arg3
+      , const SubArg4_type & arg4 , const SubArg5_type & arg5
+      , const SubArg6_type & arg6
+      );
+
+  template< class D , class A1 , class A2 , class A3
+          , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+          , class SubArg4_type , class SubArg5_type
+          >
+  KOKKOS_INLINE_FUNCTION
+  View( const View<D,A1,A2,A3,Impl::ViewDefault> & src
+      , const SubArg0_type & arg0 , const SubArg1_type & arg1
+      , const SubArg2_type & arg2 , const SubArg3_type & arg3
+      , const SubArg4_type & arg4 , const SubArg5_type & arg5
+      );
+
+  template< class D , class A1 , class A2 , class A3
+          , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+          , class SubArg4_type
+          >
+  KOKKOS_INLINE_FUNCTION
+  View( const View<D,A1,A2,A3,Impl::ViewDefault> & src
+      , const SubArg0_type & arg0 , const SubArg1_type & arg1
+      , const SubArg2_type & arg2 , const SubArg3_type & arg3
+      , const SubArg4_type & arg4
+      );
+
+  template< class D , class A1 , class A2 , class A3
+          , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+          >
+  KOKKOS_INLINE_FUNCTION
+  View( const View<D,A1,A2,A3,Impl::ViewDefault> & src
+      , const SubArg0_type & arg0 , const SubArg1_type & arg1
+      , const SubArg2_type & arg2 , const SubArg3_type & arg3
+      );
+
+  template< class D , class A1 , class A2 , class A3
+          , class SubArg0_type , class SubArg1_type , class SubArg2_type
+          >
+  KOKKOS_INLINE_FUNCTION
+  View( const View<D,A1,A2,A3,Impl::ViewDefault> & src
+      , const SubArg0_type & arg0 , const SubArg1_type & arg1
+      , const SubArg2_type & arg2
+      );
+
+  template< class D , class A1 , class A2 , class A3
+          , class SubArg0_type , class SubArg1_type
+          >
+  KOKKOS_INLINE_FUNCTION
+  View( const View<D,A1,A2,A3,Impl::ViewDefault> & src
+      , const SubArg0_type & arg0 , const SubArg1_type & arg1
+      );
+
+  template< class D , class A1 , class A2 , class A3
+          , class SubArg0_type
+          >
+  KOKKOS_INLINE_FUNCTION
+  View( const View<D,A1,A2,A3,Impl::ViewDefault> & src
+      , const SubArg0_type & arg0
+      );
+
+  //------------------------------------
+  // Assign unmanaged View to portion of execution space's shared memory
+
+  typedef Impl::if_c< ! traits::is_managed ,
+                      const typename traits::execution_space::scratch_memory_space & ,
+                      Impl::ViewError::device_shmem_constructor_requires_unmanaged >
+      if_scratch_memory_constructor ;
+
+  explicit KOKKOS_INLINE_FUNCTION
+  View( typename if_scratch_memory_constructor::type space ,
+        const unsigned n0 = 0 ,
+        const unsigned n1 = 0 ,
+        const unsigned n2 = 0 ,
+        const unsigned n3 = 0 ,
+        const unsigned n4 = 0 ,
+        const unsigned n5 = 0 ,
+        const unsigned n6 = 0 ,
+        const unsigned n7 = 0 )
+    : m_ptr_on_device()
+    , m_offset_map()
+    , m_management()
+    , m_tracker()
+    {
+      typedef typename traits::value_type  value_type_ ;
+
+      enum { align = 8 };
+      enum { mask  = align - 1 };
+
+      m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 );
+
+      typedef Impl::if_c< ! traits::is_managed ,
+                          value_type_ * ,
+                          Impl::ViewError::device_shmem_constructor_requires_unmanaged >
+        if_device_shmem_pointer ;
+
+      // Select the first argument:
+      m_ptr_on_device = if_device_shmem_pointer::select(
+       (value_type_*) space.get_shmem( unsigned( sizeof(value_type_) * m_offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ) );
+    }
+
+  explicit KOKKOS_INLINE_FUNCTION
+  View( typename if_scratch_memory_constructor::type space ,
+        typename traits::array_layout const & layout)
+    : m_ptr_on_device()
+    , m_offset_map()
+    , m_management()
+    , m_tracker()
+    {
+      typedef typename traits::value_type  value_type_ ;
+
+      typedef Impl::if_c< ! traits::is_managed ,
+                          value_type_ * ,
+                          Impl::ViewError::device_shmem_constructor_requires_unmanaged >
+        if_device_shmem_pointer ;
+
+      m_offset_map.assign( layout );
+      m_management.set_unmanaged();
+      m_management.set_noncontiguous();
+
+      enum { align = 8 };
+      enum { mask  = align - 1 };
+
+      // Select the first argument:
+      m_ptr_on_device = if_device_shmem_pointer::select(
+       (value_type_*) space.get_shmem( unsigned( sizeof(value_type_) * m_offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ) );
+    }
+
+  static inline
+  unsigned shmem_size( const unsigned n0 = 0 ,
+                       const unsigned n1 = 0 ,
+                       const unsigned n2 = 0 ,
+                       const unsigned n3 = 0 ,
+                       const unsigned n4 = 0 ,
+                       const unsigned n5 = 0 ,
+                       const unsigned n6 = 0 ,
+                       const unsigned n7 = 0 )
+  {
+    enum { align = 8 };
+    enum { mask  = align - 1 };
+
+    typedef typename traits::value_type  value_type_ ;
+
+    offset_map_type offset_map ;
+
+    offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 );
+
+    return unsigned( sizeof(value_type_) * offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ;
+  }
+
+  //------------------------------------
+  // Is not allocated
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  bool is_null() const { return 0 == ptr_on_device() ; }
+
+  //------------------------------------
+  // Operators for scalar (rank zero) views.
+
+  typedef Impl::if_c< traits::rank == 0 ,
+                      typename traits::value_type ,
+                      Impl::ViewError::scalar_operator_called_from_non_scalar_view >
+    if_scalar_operator ;
+
+  KOKKOS_INLINE_FUNCTION
+  const View & operator = ( const typename if_scalar_operator::type & rhs ) const
+    {
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      *m_ptr_on_device = if_scalar_operator::select( rhs );
+      return *this ;
+    }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  operator typename if_scalar_operator::type & () const
+    {
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      return if_scalar_operator::select( *m_ptr_on_device );
+    }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename if_scalar_operator::type & operator()() const
+    {
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      return if_scalar_operator::select( *m_ptr_on_device );
+    }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename if_scalar_operator::type & operator*() const
+    {
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+      return if_scalar_operator::select( *m_ptr_on_device );
+    }
+
+  //------------------------------------
+  // Array member access operators enabled if
+  // (1) a zero value of all argument types are compile-time comparable to zero
+  // (2) the rank matches the number of arguments
+  // (3) the memory space is valid for the access
+  //------------------------------------
+  // rank 1:
+  // Specialisation for LayoutLeft and LayoutRight since we know its stride 1
+
+  template< typename iType0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutLeft, 1, iType0 >::type
+    operator[] ( const iType0 & i0 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ i0 ];
+    }
+
+  template< typename iType0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type , traits,  LayoutLeft, 1, iType0 >::type
+    operator() ( const iType0 & i0 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ i0 ];
+    }
+
+  template< typename iType0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutLeft, 1, iType0 >::type
+    at( const iType0 & i0 , const int , const int , const int ,
+        const int , const int , const int , const int ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ i0 ];
+    }
+
+  template< typename iType0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutRight, 1, iType0 >::type
+    operator[] ( const iType0 & i0 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ i0 ];
+    }
+
+  template< typename iType0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type , traits,  LayoutRight, 1, iType0 >::type
+    operator() ( const iType0 & i0 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ i0 ];
+    }
+
+  template< typename iType0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type , traits, LayoutRight, 1, iType0 >::type
+    at( const iType0 & i0 , const int , const int , const int ,
+        const int , const int , const int , const int ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ i0 ];
+    }
+
+  template< typename iType0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type , traits,
+                 typename Impl::if_c<
+                   Impl::is_same<typename traits::array_layout, LayoutRight>::value ||
+                   Impl::is_same<typename traits::array_layout, LayoutLeft>::value ,
+                   void, typename traits::array_layout>::type,
+                 1, iType0 >::type
+    operator[] ( const iType0 & i0 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ m_offset_map(i0) ];
+    }
+
+  template< typename iType0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type , traits,
+                 typename Impl::if_c<
+                   Impl::is_same<typename traits::array_layout, LayoutRight>::value ||
+                   Impl::is_same<typename traits::array_layout, LayoutLeft>::value ,
+                   void, typename traits::array_layout>::type,
+                 1, iType0 >::type
+    operator() ( const iType0 & i0 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ m_offset_map(i0) ];
+    }
+
+  template< typename iType0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type , traits,
+                 typename Impl::if_c<
+                   Impl::is_same<typename traits::array_layout, LayoutRight>::value ||
+                   Impl::is_same<typename traits::array_layout, LayoutLeft>::value ,
+                   void, typename traits::array_layout>::type,
+                 1, iType0 >::type
+    at( const iType0 & i0 , const int , const int , const int ,
+        const int , const int , const int , const int ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_1( m_offset_map, i0 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ m_offset_map(i0) ];
+    }
+
+  // rank 2:
+
+  template< typename iType0 , typename iType1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type ,
+                                      traits, typename traits::array_layout, 2, iType0, iType1 >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0,i1 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ m_offset_map(i0,i1) ];
+    }
+
+  template< typename iType0 , typename iType1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type ,
+                                      traits, typename traits::array_layout, 2, iType0, iType1 >::type
+    at( const iType0 & i0 , const iType1 & i1 , const int , const int ,
+        const int , const int , const int , const int ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0,i1 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ m_offset_map(i0,i1) ];
+    }
+
+  // rank 3:
+
+  template< typename iType0 , typename iType1 , typename iType2 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type ,
+                                      traits, typename traits::array_layout, 3, iType0, iType1, iType2 >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_offset_map, i0,i1,i2 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ m_offset_map(i0,i1,i2) ];
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type ,
+                                      traits, typename traits::array_layout, 3, iType0, iType1, iType2 >::type
+    at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const int ,
+        const int , const int , const int , const int ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_3( m_offset_map, i0,i1,i2 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ m_offset_map(i0,i1,i2) ];
+    }
+
+  // rank 4:
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type ,
+                                      traits, typename traits::array_layout, 4, iType0, iType1, iType2, iType3 >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_offset_map, i0,i1,i2,i3 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3) ];
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type ,
+                                      traits, typename traits::array_layout, 4, iType0, iType1, iType2, iType3 >::type
+    at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
+        const int , const int , const int , const int ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_4( m_offset_map, i0,i1,i2,i3 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3) ];
+    }
+
+  // rank 5:
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
+            typename iType4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type ,
+                                      traits, typename traits::array_layout, 5, iType0, iType1, iType2, iType3 , iType4 >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
+                 const iType4 & i4 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_offset_map, i0,i1,i2,i3,i4 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4) ];
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
+            typename iType4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type ,
+                                      traits, typename traits::array_layout, 5, iType0, iType1, iType2, iType3 , iType4 >::type
+    at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
+        const iType4 & i4 , const int , const int , const int ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_5( m_offset_map, i0,i1,i2,i3,i4 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4) ];
+    }
+
+  // rank 6:
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
+            typename iType4 , typename iType5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type ,
+                                      traits, typename traits::array_layout, 6,
+                                      iType0, iType1, iType2, iType3 , iType4, iType5 >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
+                 const iType4 & i4 , const iType5 & i5 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_offset_map, i0,i1,i2,i3,i4,i5 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5) ];
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
+            typename iType4 , typename iType5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type ,
+                                      traits, typename traits::array_layout, 6,
+                                      iType0, iType1, iType2, iType3 , iType4, iType5 >::type
+    at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
+        const iType4 & i4 , const iType5 & i5 , const int , const int ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_6( m_offset_map, i0,i1,i2,i3,i4,i5 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5) ];
+    }
+
+  // rank 7:
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
+            typename iType4 , typename iType5 , typename iType6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type ,
+                                      traits, typename traits::array_layout, 7,
+                                      iType0, iType1, iType2, iType3 , iType4, iType5, iType6 >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
+                 const iType4 & i4 , const iType5 & i5 , const iType6 & i6 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_offset_map, i0,i1,i2,i3,i4,i5,i6 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6) ];
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
+            typename iType4 , typename iType5 , typename iType6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type ,
+                                      traits, typename traits::array_layout, 7,
+                                      iType0, iType1, iType2, iType3 , iType4, iType5, iType6 >::type
+    at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
+        const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const int ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_7( m_offset_map, i0,i1,i2,i3,i4,i5,i6 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6) ];
+    }
+
+  // rank 8:
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
+            typename iType4 , typename iType5 , typename iType6 , typename iType7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type ,
+                                      traits, typename traits::array_layout, 8,
+                                      iType0, iType1, iType2, iType3 , iType4, iType5, iType6, iType7 >::type
+    operator() ( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
+                 const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_offset_map, i0,i1,i2,i3,i4,i5,i6,i7 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6,i7) ];
+    }
+
+  template< typename iType0 , typename iType1 , typename iType2 , typename iType3 ,
+            typename iType4 , typename iType5 , typename iType6 , typename iType7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< reference_type ,
+                                      traits, typename traits::array_layout, 8,
+                                      iType0, iType1, iType2, iType3 , iType4, iType5, iType6, iType7 >::type
+    at( const iType0 & i0 , const iType1 & i1 , const iType2 & i2 , const iType3 & i3 ,
+        const iType4 & i4 , const iType5 & i5 , const iType6 & i6 , const iType7 & i7 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_8( m_offset_map, i0,i1,i2,i3,i4,i5,i6,i7 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , ptr_on_device() );
+
+      return m_ptr_on_device[ m_offset_map(i0,i1,i2,i3,i4,i5,i6,i7) ];
+    }
+
+  //------------------------------------
+  // Access to the underlying contiguous storage of this view specialization.
+  // These methods are specific to specialization of a view.
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename traits::value_type * ptr_on_device() const
+    { return (typename traits::value_type *) m_ptr_on_device ; }
+
+  // Stride of physical storage, dimensioned to at least Rank
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+  { m_offset_map.stride(s); }
+
+  // Count of contiguously allocated data members including padding.
+  KOKKOS_INLINE_FUNCTION
+  typename traits::size_type capacity() const
+  { return m_offset_map.capacity(); }
+
+  // If the view data can be treated (deep copied)
+  // as a contiguous block of memory.
+  KOKKOS_INLINE_FUNCTION
+  bool is_contiguous() const
+  { return m_management.is_contiguous(); }
+
+  const Impl::AllocationTracker & tracker() const { return m_tracker; }
+};
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< class LT , class LL , class LD , class LM , class LS ,
+          class RT , class RL , class RD , class RM , class RS >
+KOKKOS_INLINE_FUNCTION
+typename Impl::enable_if<( Impl::is_same< LS , RS >::value ), bool >::type
+operator == ( const View<LT,LL,LD,LM,LS> & lhs ,
+              const View<RT,RL,RD,RM,RS> & rhs )
+{
+  // Same data, layout, dimensions
+  typedef ViewTraits<LT,LL,LD,LM> lhs_traits ;
+  typedef ViewTraits<RT,RL,RD,RM> rhs_traits ;
+
+  return
+    Impl::is_same< typename lhs_traits::const_data_type ,
+                   typename rhs_traits::const_data_type >::value &&
+    Impl::is_same< typename lhs_traits::array_layout ,
+                   typename rhs_traits::array_layout >::value &&
+    Impl::is_same< typename lhs_traits::memory_space ,
+                   typename rhs_traits::memory_space >::value &&
+    Impl::is_same< typename lhs_traits::specialize ,
+                   typename rhs_traits::specialize >::value &&
+    lhs.ptr_on_device() == rhs.ptr_on_device() &&
+    lhs.shape()         == rhs.shape() ;
+}
+
+template< class LT , class LL , class LD , class LM , class LS ,
+          class RT , class RL , class RD , class RM , class RS >
+KOKKOS_INLINE_FUNCTION
+bool operator != ( const View<LT,LL,LD,LM,LS> & lhs ,
+                   const View<RT,RL,RD,RM,RS> & rhs )
+{
+  return ! operator==( lhs , rhs );
+}
+
+//----------------------------------------------------------------------------
+
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+/** \brief  Deep copy a value into a view.
+ */
+template< class DT , class DL , class DD , class DM , class DS >
+inline
+void deep_copy( const View<DT,DL,DD,DM,DS> & dst ,
+                typename Impl::enable_if<(
+                  Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::non_const_value_type ,
+                                 typename ViewTraits<DT,DL,DD,DM>::value_type >::value
+                ), typename ViewTraits<DT,DL,DD,DM>::const_value_type >::type & value )
+{
+  Impl::ViewFill< View<DT,DL,DD,DM,DS> >( dst , value );
+}
+
+template< class ST , class SL , class SD , class SM , class SS >
+inline
+typename Impl::enable_if<( ViewTraits<ST,SL,SD,SM>::rank == 0 )>::type
+deep_copy( ST & dst , const View<ST,SL,SD,SM,SS> & src )
+{
+  typedef  ViewTraits<ST,SL,SD,SM>  src_traits ;
+  typedef typename src_traits::memory_space  src_memory_space ;
+  Impl::DeepCopy< HostSpace , src_memory_space >( & dst , src.ptr_on_device() , sizeof(ST) );
+}
+
+//----------------------------------------------------------------------------
+/** \brief  A deep copy between views of compatible type, and rank zero.
+ */
+template< class DT , class DL , class DD , class DM , class DS ,
+          class ST , class SL , class SD , class SM , class SS >
+inline
+void deep_copy( const View<DT,DL,DD,DM,DS> & dst ,
+                const View<ST,SL,SD,SM,SS> & src ,
+                typename Impl::enable_if<(
+                  // Same type and destination is not constant:
+                  Impl::is_same< typename View<DT,DL,DD,DM,DS>::value_type ,
+                                 typename View<ST,SL,SD,SM,SS>::non_const_value_type >::value
+                  &&
+                  // Rank zero:
+                  ( unsigned(View<DT,DL,DD,DM,DS>::rank) == unsigned(0) ) &&
+                  ( unsigned(View<ST,SL,SD,SM,SS>::rank) == unsigned(0) )
+                )>::type * = 0 )
+{
+  typedef  View<DT,DL,DD,DM,DS>  dst_type ;
+  typedef  View<ST,SL,SD,SM,SS>  src_type ;
+
+  typedef typename dst_type::memory_space  dst_memory_space ;
+  typedef typename src_type::memory_space  src_memory_space ;
+  typedef typename src_type::value_type    value_type ;
+
+  if ( dst.ptr_on_device() != src.ptr_on_device() ) {
+    Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.ptr_on_device() , src.ptr_on_device() , sizeof(value_type) );
+  }
+}
+
+//----------------------------------------------------------------------------
+/** \brief  A deep copy between views of the default specialization, compatible type,
+ *          same non-zero rank, same contiguous layout.
+ */
+template< class DT , class DL , class DD , class DM ,
+          class ST , class SL , class SD , class SM >
+inline
+void deep_copy( const View<DT,DL,DD,DM,Impl::ViewDefault> & dst ,
+                const View<ST,SL,SD,SM,Impl::ViewDefault> & src ,
+                typename Impl::enable_if<(
+                  // Same type and destination is not constant:
+                  Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::value_type ,
+                                 typename View<ST,SL,SD,SM,Impl::ViewDefault>::non_const_value_type >::value
+                  &&
+                  // Same non-zero rank:
+                  ( unsigned(View<DT,DL,DD,DM,Impl::ViewDefault>::rank) ==
+                    unsigned(View<ST,SL,SD,SM,Impl::ViewDefault>::rank) )
+                  &&
+                  ( 0 < unsigned(View<DT,DL,DD,DM,Impl::ViewDefault>::rank) )
+                  &&
+                  // Same layout:
+                  Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout ,
+                                 typename View<ST,SL,SD,SM,Impl::ViewDefault>::array_layout >::value
+                )>::type * = 0 )
+{
+  typedef  View<DT,DL,DD,DM,Impl::ViewDefault>  dst_type ;
+  typedef  View<ST,SL,SD,SM,Impl::ViewDefault>  src_type ;
+
+  typedef typename dst_type::memory_space  dst_memory_space ;
+  typedef typename src_type::memory_space  src_memory_space ;
+
+  enum { is_contiguous = // Contiguous (e.g., non-strided, non-tiled) layout
+           Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout , LayoutLeft >::value ||
+           Impl::is_same< typename View<DT,DL,DD,DM,Impl::ViewDefault>::array_layout , LayoutRight >::value };
+
+  if ( dst.ptr_on_device() != src.ptr_on_device() ) {
+
+    // Same shape (dimensions)
+
+    const bool shapes_are_equal = dst.shape() == src.shape();
+
+    if ( shapes_are_equal && is_contiguous && dst.capacity() == src.capacity() ) {
+
+      // Views span equal length contiguous range.
+      // Assuming can perform a straight memory copy over this range.
+
+      const size_t nbytes = sizeof(typename dst_type::value_type) * dst.capacity();
+
+      Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.ptr_on_device() , src.ptr_on_device() , nbytes );
+    }
+    else {
+      // Destination view's execution space must be able to directly access source memory space
+      // in order for the ViewRemap functor run in the destination memory space's execution space.
+      size_t stride[8];
+      src.stride(stride);
+      size_t size_stride = stride[0]*src.dimension_0();
+      size_t size_dim = src.dimension_0();
+      for(int i = 1; i<src.rank; i++) {
+        if(stride[i]*src.dimension(i)>size_stride)
+          size_stride = stride[i]*src.dimension(i);
+        size_dim*=src.dimension(i);
+      }
+
+      if( shapes_are_equal && size_stride == size_dim) {
+        const size_t nbytes = sizeof(typename dst_type::value_type) * dst.capacity();
+
+        Impl::DeepCopy< dst_memory_space , src_memory_space >( dst.ptr_on_device() , src.ptr_on_device() , nbytes );
+      } else {
+        Impl::ViewRemap< dst_type , src_type >( dst , src );
+      }
+    }
+  }
+}
+
+
+/** \brief Deep copy equal dimension arrays in the same space which
+ *         have different layouts or specializations.
+ */
+template< class DT , class DL , class DD , class DM , class DS ,
+          class ST , class SL , class SD , class SM , class SS >
+inline
+void deep_copy( const View< DT, DL, DD, DM, DS > & dst ,
+                const View< ST, SL, SD, SM, SS > & src ,
+                const typename Impl::enable_if<(
+                  // Same type and destination is not constant:
+                  Impl::is_same< typename View<DT,DL,DD,DM,DS>::value_type ,
+                                 typename View<DT,DL,DD,DM,DS>::non_const_value_type >::value
+                  &&
+                  // Source memory space is accessible to destination memory space
+                  Impl::VerifyExecutionCanAccessMemorySpace< typename View<DT,DL,DD,DM,DS>::memory_space
+                                                           , typename View<ST,SL,SD,SM,SS>::memory_space >::value
+                  &&
+                  // Same non-zero rank
+                  ( unsigned( View<DT,DL,DD,DM,DS>::rank ) ==
+                    unsigned( View<ST,SL,SD,SM,SS>::rank ) )
+                  &&
+                  ( 0 < unsigned( View<DT,DL,DD,DM,DS>::rank ) )
+                  &&
+                  // Different layout or different specialization:
+                  ( ( ! Impl::is_same< typename View<DT,DL,DD,DM,DS>::array_layout ,
+                                       typename View<ST,SL,SD,SM,SS>::array_layout >::value )
+                    ||
+                    ( ! Impl::is_same< DS , SS >::value )
+                  )
+                )>::type * = 0 )
+{
+  typedef View< DT, DL, DD, DM, DS > dst_type ;
+  typedef View< ST, SL, SD, SM, SS > src_type ;
+
+  assert_shapes_equal_dimension( dst.shape() , src.shape() );
+
+  Impl::ViewRemap< dst_type , src_type >( dst , src );
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template< class T , class L , class D , class M , class S >
+typename Impl::enable_if<(
+    View<T,L,D,M,S>::is_managed &&
+    !Impl::is_same<L,LayoutStride>::value
+  ), typename View<T,L,D,M,S>::HostMirror >::type
+inline
+create_mirror( const View<T,L,D,M,S> & src )
+{
+  typedef View<T,L,D,M,S>                  view_type ;
+  typedef typename view_type::HostMirror    host_view_type ;
+
+  // 'view' is managed therefore we can allocate a
+  // compatible host_view through the ordinary constructor.
+
+  std::string label = src.tracker().label();
+  label.append("_mirror");
+
+  return host_view_type( label ,
+                         src.dimension_0() ,
+                         src.dimension_1() ,
+                         src.dimension_2() ,
+                         src.dimension_3() ,
+                         src.dimension_4() ,
+                         src.dimension_5() ,
+                         src.dimension_6() ,
+                         src.dimension_7() );
+}
+
+template< class T , class L , class D , class M , class S >
+typename Impl::enable_if<(
+    View<T,L,D,M,S>::is_managed &&
+    Impl::is_same<L,LayoutStride>::value
+  ), typename View<T,L,D,M,S>::HostMirror >::type
+inline
+create_mirror( const View<T,L,D,M,S> & src )
+{
+  typedef View<T,L,D,M,S>                  view_type ;
+  typedef typename view_type::HostMirror    host_view_type ;
+
+  // 'view' is managed therefore we can allocate a
+  // compatible host_view through the ordinary constructor.
+
+  std::string label = src.tracker().label();
+  label.append("_mirror");
+  LayoutStride layout;
+  src.stride(layout.stride);
+  layout.dimension[0] = src.dimension_0();
+  layout.dimension[1] = src.dimension_1();
+  layout.dimension[2] = src.dimension_2();
+  layout.dimension[3] = src.dimension_3();
+  layout.dimension[4] = src.dimension_4();
+  layout.dimension[5] = src.dimension_5();
+  layout.dimension[6] = src.dimension_6();
+  layout.dimension[7] = src.dimension_7();
+
+  return host_view_type( label , layout );
+}
+template< class T , class L , class D , class M , class S >
+typename Impl::enable_if<(
+    View<T,L,D,M,S>::is_managed &&
+    Impl::ViewAssignable< typename View<T,L,D,M,S>::HostMirror , View<T,L,D,M,S> >::value
+  ), typename View<T,L,D,M,S>::HostMirror >::type
+inline
+create_mirror_view( const View<T,L,D,M,S> & src )
+{
+  return src ;
+}
+
+template< class T , class L , class D , class M , class S >
+typename Impl::enable_if<(
+    View<T,L,D,M,S>::is_managed &&
+    ! Impl::ViewAssignable< typename View<T,L,D,M,S>::HostMirror , View<T,L,D,M,S> >::value
+  ), typename View<T,L,D,M,S>::HostMirror >::type
+inline
+create_mirror_view( const View<T,L,D,M,S> & src )
+{
+  return create_mirror( src );
+}
+
+//----------------------------------------------------------------------------
+
+/** \brief  Resize a view with copying old data to new data at the corresponding indices. */
+template< class T , class L , class D , class M , class S >
+inline
+void resize( View<T,L,D,M,S> & v ,
+             const typename Impl::enable_if< ViewTraits<T,L,D,M>::is_managed , size_t >::type n0 ,
+             const size_t n1 = 0 ,
+             const size_t n2 = 0 ,
+             const size_t n3 = 0 ,
+             const size_t n4 = 0 ,
+             const size_t n5 = 0 ,
+             const size_t n6 = 0 ,
+             const size_t n7 = 0 )
+{
+  typedef View<T,L,D,M,S> view_type ;
+
+  const std::string label = v.tracker().label();
+
+  view_type v_resized( label, n0, n1, n2, n3, n4, n5, n6, n7 );
+
+  Impl::ViewRemap< view_type , view_type >( v_resized , v );
+
+  v = v_resized ;
+}
+
+/** \brief  Reallocate a view without copying old data to new data */
+template< class T , class L , class D , class M , class S >
+inline
+void realloc( View<T,L,D,M,S> & v ,
+              const typename Impl::enable_if< ViewTraits<T,L,D,M>::is_managed , size_t >::type n0 ,
+              const size_t n1 = 0 ,
+              const size_t n2 = 0 ,
+              const size_t n3 = 0 ,
+              const size_t n4 = 0 ,
+              const size_t n5 = 0 ,
+              const size_t n6 = 0 ,
+              const size_t n7 = 0 )
+{
+  typedef View<T,L,D,M,S> view_type ;
+
+  // Query the current label and reuse it.
+  const std::string label = v.tracker().label();
+
+  v = view_type(); // deallocate first, if the only view to memory.
+  v = view_type( label, n0, n1, n2, n3, n4, n5, n6, n7 );
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+/** \brief  Tag denoting that a subview should capture all of a dimension */
+struct ALL { KOKKOS_INLINE_FUNCTION ALL(){} };
+
+template< class D , class A1 , class A2 , class A3 , class S ,
+          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
+          class ArgType4 , class ArgType5 , class ArgType6 , class ArgType7 >
+KOKKOS_INLINE_FUNCTION
+typename Impl::ViewSubview< View<D,A1,A2,A3,S>
+                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                          , ArgType4 , ArgType5 , ArgType6 , ArgType7
+                          >::type
+subview( const View<D,A1,A2,A3,S> & src ,
+         const ArgType0 & arg0 ,
+         const ArgType1 & arg1 ,
+         const ArgType2 & arg2 ,
+         const ArgType3 & arg3 ,
+         const ArgType4 & arg4 ,
+         const ArgType5 & arg5 ,
+         const ArgType6 & arg6 ,
+         const ArgType7 & arg7 )
+{
+  typedef typename
+    Impl::ViewSubview< View<D,A1,A2,A3,S>
+                 , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                 , ArgType4 , ArgType5 , ArgType6 , ArgType7
+                 >::type
+      DstViewType ;
+
+  return DstViewType( src, arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7 );
+}
+
+template< class D , class A1 , class A2 , class A3 , class S ,
+          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
+          class ArgType4 , class ArgType5 , class ArgType6 >
+KOKKOS_INLINE_FUNCTION
+typename Impl::ViewSubview< View<D,A1,A2,A3,S>
+                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                          , ArgType4 , ArgType5 , ArgType6 , void
+                          >::type
+subview( const View<D,A1,A2,A3,S> & src ,
+         const ArgType0 & arg0 ,
+         const ArgType1 & arg1 ,
+         const ArgType2 & arg2 ,
+         const ArgType3 & arg3 ,
+         const ArgType4 & arg4 ,
+         const ArgType5 & arg5 ,
+         const ArgType6 & arg6 )
+{
+  typedef typename
+    Impl::ViewSubview< View<D,A1,A2,A3,S>
+                 , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                 , ArgType4 , ArgType5 , ArgType6 , void
+                 >::type
+      DstViewType ;
+
+  return DstViewType( src, arg0, arg1, arg2, arg3, arg4, arg5, arg6 );
+}
+
+template< class D , class A1 , class A2 , class A3 , class S ,
+          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
+          class ArgType4 , class ArgType5 >
+KOKKOS_INLINE_FUNCTION
+typename Impl::ViewSubview< View<D,A1,A2,A3,S>
+                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                          , ArgType4 , ArgType5 , void , void
+                          >::type
+subview( const View<D,A1,A2,A3,S> & src ,
+         const ArgType0 & arg0 ,
+         const ArgType1 & arg1 ,
+         const ArgType2 & arg2 ,
+         const ArgType3 & arg3 ,
+         const ArgType4 & arg4 ,
+         const ArgType5 & arg5 )
+{
+  typedef typename
+    Impl::ViewSubview< View<D,A1,A2,A3,S>
+                 , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                 , ArgType4 , ArgType5 , void , void
+                 >::type
+      DstViewType ;
+
+  return DstViewType( src, arg0, arg1, arg2, arg3, arg4, arg5 );
+}
+
+template< class D , class A1 , class A2 , class A3 , class S ,
+          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 ,
+          class ArgType4 >
+KOKKOS_INLINE_FUNCTION
+typename Impl::ViewSubview< View<D,A1,A2,A3,S>
+                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                          , ArgType4 , void , void , void
+                          >::type
+subview( const View<D,A1,A2,A3,S> & src ,
+         const ArgType0 & arg0 ,
+         const ArgType1 & arg1 ,
+         const ArgType2 & arg2 ,
+         const ArgType3 & arg3 ,
+         const ArgType4 & arg4 )
+{
+  typedef typename
+    Impl::ViewSubview< View<D,A1,A2,A3,S>
+                 , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                 , ArgType4 , void , void , void
+                 >::type
+      DstViewType ;
+
+  return DstViewType( src, arg0, arg1, arg2, arg3, arg4 );
+}
+
+template< class D , class A1 , class A2 , class A3 , class S ,
+          class ArgType0 , class ArgType1 , class ArgType2 , class ArgType3 >
+KOKKOS_INLINE_FUNCTION
+typename Impl::ViewSubview< View<D,A1,A2,A3,S>
+                          , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                          , void , void , void , void
+                          >::type
+subview( const View<D,A1,A2,A3,S> & src ,
+         const ArgType0 & arg0 ,
+         const ArgType1 & arg1 ,
+         const ArgType2 & arg2 ,
+         const ArgType3 & arg3 )
+{
+  typedef typename
+    Impl::ViewSubview< View<D,A1,A2,A3,S>
+                 , ArgType0 , ArgType1 , ArgType2 , ArgType3
+                 , void , void , void , void
+                 >::type
+      DstViewType ;
+
+  return DstViewType( src, arg0, arg1, arg2, arg3 );
+}
+
+template< class D , class A1 , class A2 , class A3 , class S ,
+          class ArgType0 , class ArgType1 , class ArgType2 >
+KOKKOS_INLINE_FUNCTION
+typename Impl::ViewSubview< View<D,A1,A2,A3,S>
+                          , ArgType0 , ArgType1 , ArgType2 , void
+                          , void , void , void , void
+                          >::type
+subview( const View<D,A1,A2,A3,S> & src ,
+         const ArgType0 & arg0 ,
+         const ArgType1 & arg1 ,
+         const ArgType2 & arg2 )
+{
+  typedef typename
+    Impl::ViewSubview< View<D,A1,A2,A3,S>
+                 , ArgType0 , ArgType1 , ArgType2 , void
+                 , void , void , void , void
+                 >::type
+      DstViewType ;
+
+  return DstViewType( src, arg0, arg1, arg2 );
+}
+
+template< class D , class A1 , class A2 , class A3 , class S ,
+          class ArgType0 , class ArgType1 >
+KOKKOS_INLINE_FUNCTION
+typename Impl::ViewSubview< View<D,A1,A2,A3,S>
+                          , ArgType0 , ArgType1 , void , void
+                          , void , void , void , void
+                          >::type
+subview( const View<D,A1,A2,A3,S> & src ,
+         const ArgType0 & arg0 ,
+         const ArgType1 & arg1 )
+{
+  typedef typename
+    Impl::ViewSubview< View<D,A1,A2,A3,S>
+                 , ArgType0 , ArgType1 , void , void
+                 , void , void , void , void
+                 >::type
+      DstViewType ;
+
+  return DstViewType( src, arg0, arg1 );
+}
+
+template< class D , class A1 , class A2 , class A3 , class S ,
+          class ArgType0 >
+KOKKOS_INLINE_FUNCTION
+typename Impl::ViewSubview< View<D,A1,A2,A3,S>
+                          , ArgType0 , void , void , void
+                          , void , void , void , void
+                          >::type
+subview( const View<D,A1,A2,A3,S> & src ,
+         const ArgType0 & arg0 )
+{
+  typedef typename
+    Impl::ViewSubview< View<D,A1,A2,A3,S>
+                 , ArgType0 , void , void , void
+                 , void , void , void , void
+                 >::type
+      DstViewType ;
+
+  return DstViewType( src, arg0 );
+}
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#include <impl/Kokkos_ViewDefault.hpp>
+#include <impl/Kokkos_Atomic_View.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#else
+
+#include <impl/Kokkos_ViewOffset.hpp>
+#include <impl/Kokkos_ViewSupport.hpp>
+
+#endif /* #if defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
+
+#include <KokkosExp_View.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif
+
diff --git a/lib/kokkos/core/src/Kokkos_hwloc.hpp b/lib/kokkos/core/src/Kokkos_hwloc.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..a0b007f64274e5177e34568c02caf75368087045
--- /dev/null
+++ b/lib/kokkos/core/src/Kokkos_hwloc.hpp
@@ -0,0 +1,140 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_HWLOC_HPP
+#define KOKKOS_HWLOC_HPP
+
+#include <utility>
+
+namespace Kokkos {
+
+/** \brief  Minimal subset of logical 'hwloc' functionality available
+ *          from http://www.open-mpi.org/projects/hwloc/.
+ *
+ *  The calls are NOT thread safe in order to avoid mutexes,
+ *  memory allocations, or other actions which could give the
+ *  runtime system an opportunity to migrate the threads or
+ *  touch allocated memory during the function calls.
+ *
+ *  All calls to these functions should be performed by a thread
+ *  when it has guaranteed exclusive access; e.g., for OpenMP
+ *  within a 'critical' region.
+ */
+namespace hwloc {
+
+/** \brief  Query if hwloc is available */
+bool available();
+
+/** \brief  Query number of available NUMA regions.
+ *          This will be less than the hardware capacity
+ *          if the MPI process is pinned to a NUMA region.
+ */
+unsigned get_available_numa_count();
+
+/** \brief  Query number of available cores per NUMA regions.
+ *          This will be less than the hardware capacity
+ *          if the MPI process is pinned to a set of cores.
+ */
+unsigned get_available_cores_per_numa();
+
+/** \brief  Query number of available "hard" threads per core; i.e., hyperthreads */
+unsigned get_available_threads_per_core();
+
+} /* namespace hwloc */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Internal functions for binding persistent spawned threads.
+
+namespace Kokkos {
+namespace hwloc {
+
+/** \brief  Recommend mapping of threads onto cores.
+ *
+ * If thread_count == 0 then choose and set a value.
+ * If use_numa_count == 0 then choose and set a value.
+ * If use_cores_per_numa == 0 then choose and set a value.
+ *
+ * Return 0 if asynchronous,
+ * Return 1 if synchronous and threads_coord[0] is process core
+ */
+unsigned thread_mapping( const char * const label ,
+                         const bool allow_async ,
+                         unsigned & thread_count ,
+                         unsigned & use_numa_count ,
+                         unsigned & use_cores_per_numa ,
+                         std::pair<unsigned,unsigned> threads_coord[] );
+
+/** \brief  Query core-coordinate of the current thread
+ *          with respect to the core_topology.
+ *
+ *  As long as the thread is running within the 
+ *  process binding the following condition holds.
+ *
+ *  core_coordinate.first  < core_topology.first
+ *  core_coordinate.second < core_topology.second
+ */
+std::pair<unsigned,unsigned> get_this_thread_coordinate();
+
+/** \brief  Bind the current thread to a core. */
+bool bind_this_thread( const std::pair<unsigned,unsigned> );
+
+/** \brief  Bind the current thread to one of the cores in the list.
+ *          Set that entry to (~0,~0) and return the index.
+ *          If binding fails return ~0.
+ */
+unsigned bind_this_thread( const unsigned               coordinate_count ,
+                           std::pair<unsigned,unsigned> coordinate[] );
+
+/** \brief  Unbind the current thread back to the original process binding */
+bool unbind_this_thread();
+
+} /* namespace hwloc */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_HWLOC_HPP */
+
diff --git a/lib/kokkos/core/src/Makefile b/lib/kokkos/core/src/Makefile
new file mode 100755
index 0000000000000000000000000000000000000000..24d8e465ff96a7583cb487c5514ef4c235980232
--- /dev/null
+++ b/lib/kokkos/core/src/Makefile
@@ -0,0 +1,118 @@
+KOKKOS_PATH = ../..
+
+PREFIX ?= /usr/local/lib/kokkos
+
+default: messages build-lib
+	echo "End Build"
+	
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	CXX = nvcc_wrapper
+	CXXFLAGS ?= -O3
+	LINK = nvcc_wrapper
+	LINKFLAGS ?= 
+else
+	CXX ?= g++
+	CXXFLAGS ?= -O3
+	LINK ?= g++
+	LINKFLAGS ?=  
+endif
+
+PWD = $(shell pwd)
+
+KOKKOS_HEADERS_INCLUDE = $(wildcard $(KOKKOS_PATH)/core/src/*.hpp)
+KOKKOS_HEADERS_INCLUDE_IMPL = $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp)
+KOKKOS_HEADERS_INCLUDE += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp)
+KOKKOS_HEADERS_INCLUDE_IMPL += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp)
+KOKKOS_HEADERS_INCLUDE += $(wildcard $(KOKKOS_PATH)/algorithms/src/*.hpp)
+
+CONDITIONAL_COPIES =
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	KOKKOS_HEADERS_CUDA += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp)
+	CONDITIONAL_COPIES += copy-cuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+	KOKKOS_HEADERS_THREADS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
+	CONDITIONAL_COPIES += copy-threads
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	KOKKOS_HEADERS_OPENMP += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
+	CONDITIONAL_COPIES += copy-openmp
+endif
+
+messages: 
+	echo "Start Build"
+
+build-makefile-kokkos:
+	rm -f Makefile.kokkos
+	echo "#Global Settings used to generate this library" >> Makefile.kokkos
+	echo "KOKKOS_PATH = $(PREFIX)" >> Makefile.kokkos
+	echo "KOKKOS_DEVICES = $(KOKKOS_DEVICES)" >> Makefile.kokkos
+	echo "KOKKOS_ARCH = $(KOKKOS_ARCH)" >> Makefile.kokkos
+	echo "KOKKOS_DEBUG = $(KOKKOS_DEBUG)" >> Makefile.kokkos
+	echo "KOKKOS_USE_TPLS = $(KOKKOS_USE_TPLS)" >> Makefile.kokkos
+	echo "KOKKOS_CXX_STANDARD = $(KOKKOS_CXX_STANDARD)" >> Makefile.kokkos
+	echo "KOKKOS_CUDA_OPTIONS = $(KOKKOS_CUDA_OPTIONS)" >> Makefile.kokkos
+	echo "CXX ?= $(CXX)" >> Makefile.kokkos 
+	echo "" >> Makefile.kokkos  
+	echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> Makefile.kokkos
+	echo "KOKKOS_HEADERS = $(KOKKOS_HEADERS)" >> Makefile.kokkos
+	echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos
+	echo "" >> Makefile.kokkos  
+	echo "#Variables used in application Makefiles" >> Makefile.kokkos
+	echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos
+	echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos
+	echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos
+	echo "KOKKOS_LINK_DEPENDS  = $(KOKKOS_LINK_DEPENDS)" >> Makefile.kokkos
+	echo "KOKKOS_LIBS = $(KOKKOS_LIBS)" >> Makefile.kokkos
+	echo "KOKKOS_LDFLAGS = $(KOKKOS_LDFLAGS)" >> Makefile.kokkos
+	sed \
+		-e 's|$(KOKKOS_PATH)/core/src|$(PREFIX)/include|g' \
+		-e 's|$(KOKKOS_PATH)/containers/src|$(PREFIX)/include|g' \
+		-e 's|$(KOKKOS_PATH)/algorithms/src|$(PREFIX)/include|g' \
+		-e 's|-L$(PWD)|-L$(PREFIX)/lib|g' \
+		-e 's|= libkokkos.a|= $(PREFIX)/lib/libkokkos.a|g' \
+		-e 's|= KokkosCore_config.h|= $(PREFIX)/include/KokkosCore_config.h|g' Makefile.kokkos \
+		> Makefile.kokkos.tmp
+	mv -f Makefile.kokkos.tmp Makefile.kokkos
+
+build-lib: build-makefile-kokkos $(KOKKOS_LINK_DEPENDS)
+
+mkdir: 
+	mkdir -p $(PREFIX)
+	mkdir -p $(PREFIX)/include
+	mkdir -p $(PREFIX)/lib
+	mkdir -p $(PREFIX)/include/impl
+
+copy-cuda: mkdir
+	mkdir -p $(PREFIX)/include/Cuda
+	cp $(KOKKOS_HEADERS_CUDA) $(PREFIX)/include/Cuda
+	
+copy-threads: mkdir
+	mkdir -p $(PREFIX)/include/Threads
+	cp $(KOKKOS_HEADERS_THREADS) $(PREFIX)/include/Threads
+
+copy-openmp: mkdir
+	mkdir -p $(PREFIX)/include/OpenMP
+	cp $(KOKKOS_HEADERS_OPENMP) $(PREFIX)/include/OpenMP
+
+install: mkdir $(CONDITIONAL_COPIES) build-lib 
+	cp $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include
+	cp $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl
+	cp Makefile.kokkos $(PREFIX)
+	cp libkokkos.a $(PREFIX)/lib
+	cp KokkosCore_config.h $(PREFIX)/include
+
+  
+
+clean: kokkos-clean
+	rm Makefile.kokkos
+
+
+
+
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..f8393611e4d10357cd8051e0535a3aa947fd8f99
--- /dev/null
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp
@@ -0,0 +1,496 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMP_PARALLEL_HPP
+#define KOKKOS_OPENMP_PARALLEL_HPP
+
+#include <omp.h>
+
+#include <Kokkos_Parallel.hpp>
+#include <OpenMP/Kokkos_OpenMPexec.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
+class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
+                 const FunctorType & >::type functor
+             , const PType & range )
+    {
+      const typename PType::member_type work_end = range.end();
+      for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
+        functor( iwork );
+      }
+    }
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
+                 const FunctorType & >::type functor
+             , const PType & range )
+    {
+      const typename PType::member_type work_end = range.end();
+      for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
+        functor( typename PType::work_tag() , iwork );
+      }
+    }
+
+public:
+
+  inline
+  ParallelFor( const FunctorType & functor
+             , const Policy      & policy )
+    {
+      OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
+      OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
+
+#pragma omp parallel
+      {
+        OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+        driver( functor , typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() ) );
+      }
+/* END #pragma omp parallel */
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
+class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
+  typedef typename Policy::work_tag                                  WorkTag ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , WorkTag >  ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , WorkTag >  ValueJoin ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
+                 const FunctorType & >::type functor
+             , reference_type update
+             , const PType & range )
+    {
+      const typename PType::member_type work_end = range.end();
+      for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
+        functor( iwork , update );
+      }
+    }
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
+                 const FunctorType & >::type functor
+             , reference_type update
+             , const PType & range )
+    {
+      const typename PType::member_type work_end = range.end();
+      for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
+        functor( typename PType::work_tag() , iwork , update );
+      }
+    }
+
+public:
+
+  //----------------------------------------
+
+  template< class ViewType >
+  inline
+  ParallelReduce( typename Impl::enable_if<
+                    ( Impl::is_view< ViewType >::value &&
+                      Impl::is_same< typename ViewType::memory_space , HostSpace >::value
+                    ), const FunctorType & >::type functor
+                , const Policy    & policy
+                , const ViewType  & result_view )
+  {
+    OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
+    OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
+
+    OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , 0 );
+
+#pragma omp parallel
+    {
+      OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+
+      driver( functor
+            , ValueInit::init( functor , exec.scratch_reduce() )
+            , typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
+            );
+    }
+/* END #pragma omp parallel */
+
+    {
+      const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
+
+      for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
+        ValueJoin::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+      }
+
+      Kokkos::Impl::FunctorFinal<  FunctorType , WorkTag >::final( functor , ptr );
+
+      if ( result_view.ptr_on_device() ) {
+        const int n = ValueTraits::value_count( functor );
+
+        for ( int j = 0 ; j < n ; ++j ) { result_view.ptr_on_device()[j] = ptr[j] ; }
+      }
+    }
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
+class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::OpenMP > Policy ;
+  typedef typename Policy::work_tag                                  WorkTag ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , WorkTag >  ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , WorkTag >  ValueJoin ;
+  typedef Kokkos::Impl::FunctorValueOps<    FunctorType , WorkTag >  ValueOps ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
+                 const FunctorType & >::type functor
+             , reference_type update
+             , const PType & range
+             , const bool    final )
+    {
+      const typename PType::member_type work_end = range.end();
+      for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
+        functor( iwork , update , final );
+      }
+    }
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
+                 const FunctorType & >::type functor
+             , reference_type update
+             , const PType & range
+             , const bool    final )
+    {
+      const typename PType::member_type work_end = range.end();
+      for ( typename PType::member_type iwork = range.begin() ; iwork < work_end ; ++iwork ) {
+        functor( typename PType::work_tag() , iwork , update , final );
+      }
+    }
+
+public:
+
+  //----------------------------------------
+
+  inline
+  ParallelScan( const FunctorType & functor
+              , const Policy      & policy )
+  {
+    OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
+    OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
+
+    OpenMPexec::resize_scratch( 2 * ValueTraits::value_size( functor ) , 0 );
+
+#pragma omp parallel
+    {
+      OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+
+      driver( functor
+            , ValueInit::init( functor , pointer_type( exec.scratch_reduce() ) + ValueTraits::value_count( functor ) )
+            , typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
+            , false );
+    }
+/* END #pragma omp parallel */
+
+    {
+      const unsigned thread_count = OpenMPexec::pool_size();
+      const unsigned value_count  = ValueTraits::value_count( functor );
+
+      pointer_type ptr_prev = 0 ;
+
+      for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
+
+        pointer_type ptr = pointer_type( OpenMPexec::pool_rev(rank_rev)->scratch_reduce() );
+
+        if ( ptr_prev ) {
+          for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
+          ValueJoin::join( functor , ptr + value_count , ptr );
+        }
+        else {
+          ValueInit::init( functor , ptr );
+        }
+
+        ptr_prev = ptr ;
+      }
+    }
+
+#pragma omp parallel
+    {
+      OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+
+      driver( functor
+            , ValueOps::reference( pointer_type( exec.scratch_reduce() ) )
+            , typename Policy::WorkRange( policy , exec.pool_rank() , exec.pool_size() )
+            , true );
+    }
+/* END #pragma omp parallel */
+
+  }
+
+  //----------------------------------------
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class Arg0 , class Arg1 >
+class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > >
+{
+private:
+
+  typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > Policy ;
+
+  template< class TagType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
+                 const FunctorType & >::type functor
+             , const typename Policy::member_type  & member )
+    { functor( member ); }
+
+  template< class TagType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
+                 const FunctorType & >::type functor
+             , const typename Policy::member_type  & member )
+    { functor( TagType() , member ); }
+
+public:
+
+  inline
+  ParallelFor( const FunctorType & functor ,
+               const Policy      & policy )
+  {
+    OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
+    OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
+
+    const size_t team_reduce_size = Policy::member_type::team_reduce_size();
+    const size_t team_shmem_size  = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
+
+    OpenMPexec::resize_scratch( 0 , team_reduce_size + team_shmem_size );
+
+#pragma omp parallel
+    {
+      typename Policy::member_type member( * OpenMPexec::get_thread_omp() , policy , team_shmem_size );
+
+      for ( ; member.valid() ; member.next() ) {
+        ParallelFor::template driver< typename Policy::work_tag >( functor , member );
+      }
+    }
+/* END #pragma omp parallel */
+  }
+
+  void wait() {}
+};
+
+
+template< class FunctorType , class Arg0 , class Arg1 >
+class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > >
+{
+private:
+
+  typedef Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP >         Policy ;
+  typedef typename Policy::work_tag                                  WorkTag ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , WorkTag >  ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , WorkTag >  ValueJoin ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if< Impl::is_same< typename PType::work_tag , void >::value ,
+                 const FunctorType & >::type functor
+             , const typename PType::member_type  & member
+             ,       reference_type update )
+    { functor( member , update ); }
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if< ! Impl::is_same< typename PType::work_tag , void >::value ,
+                 const FunctorType & >::type functor
+             , const typename PType::member_type  & member
+             ,       reference_type update )
+    { functor( typename PType::work_tag() , member , update ); }
+
+public:
+
+  inline
+  ParallelReduce( const FunctorType  & functor ,
+                  const Policy       & policy )
+  {
+    OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
+
+    const size_t team_reduce_size = Policy::member_type::team_reduce_size();
+    const size_t team_shmem_size  = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
+
+    OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , team_reduce_size + team_shmem_size );
+
+#pragma omp parallel
+    {
+      OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+
+      reference_type update = ValueInit::init( functor , exec.scratch_reduce() );
+
+      for ( typename Policy::member_type member( exec , policy , team_shmem_size ); member.valid() ; member.next() ) {
+        ParallelReduce::template driver< Policy >( functor , member , update );
+      }
+    }
+/* END #pragma omp parallel */
+
+    {
+      typedef Kokkos::Impl::FunctorValueJoin< FunctorType , WorkTag , reference_type >  Join ;
+
+      const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
+
+      for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
+        Join::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+      }
+
+      Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr );
+    }
+  }
+
+  template< class ViewType >
+  inline
+  ParallelReduce( const FunctorType  & functor ,
+                  const Policy       & policy ,
+                  const ViewType     & result )
+  {
+    OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
+
+    const size_t team_reduce_size = Policy::member_type::team_reduce_size();
+    const size_t team_shmem_size  = FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() );
+
+    OpenMPexec::resize_scratch( ValueTraits::value_size( functor ) , team_reduce_size + team_shmem_size );
+
+#pragma omp parallel
+    {
+      OpenMPexec & exec = * OpenMPexec::get_thread_omp();
+
+      reference_type update = ValueInit::init( functor , exec.scratch_reduce() );
+
+      for ( typename Policy::member_type member( exec , policy , team_shmem_size ); member.valid() ; member.next() ) {
+        ParallelReduce::template driver< Policy >( functor , member , update );
+      }
+    }
+/* END #pragma omp parallel */
+
+    {
+      const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
+
+      for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
+        ValueJoin::join( functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+      }
+
+      Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( functor , ptr );
+
+      const int n = ValueTraits::value_count( functor );
+
+      for ( int j = 0 ; j < n ; ++j ) { result.ptr_on_device()[j] = ptr[j] ; }
+    }
+  }
+
+  void wait() {}
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_OPENMP_PARALLEL_HPP */
+
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..ed98fd2f979af77a70bd4d6b0a44a570be65c40c
--- /dev/null
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.cpp
@@ -0,0 +1,364 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdio.h>
+#include <limits>
+#include <iostream>
+#include <vector>
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <iostream>
+
+#ifdef KOKKOS_HAVE_OPENMP
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+KOKKOS_INLINE_FUNCTION
+int kokkos_omp_in_parallel();
+
+int kokkos_omp_in_critical_region = ( Kokkos::HostSpace::register_in_parallel( kokkos_omp_in_parallel ) , 0 );
+
+KOKKOS_INLINE_FUNCTION
+int kokkos_omp_in_parallel()
+{
+#ifndef __CUDA_ARCH__
+  return omp_in_parallel() && ! kokkos_omp_in_critical_region ;
+#else
+  return 0;
+#endif
+}
+
+bool s_using_hwloc = false;
+
+} // namespace
+} // namespace Impl
+} // namespace Kokkos
+
+
+namespace Kokkos {
+namespace Impl {
+
+int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
+
+int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
+
+OpenMPexec::Pool OpenMPexec::m_pool;
+
+void OpenMPexec::verify_is_process( const char * const label )
+{
+  if ( omp_in_parallel() ) {
+    std::string msg( label );
+    msg.append( " ERROR: in parallel" );
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+}
+
+void OpenMPexec::verify_initialized( const char * const label )
+{
+  if ( 0 == m_pool[0] ) {
+    std::string msg( label );
+    msg.append( " ERROR: not initialized" );
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+}
+
+void OpenMPexec::clear_scratch()
+{
+#pragma omp parallel
+  {
+    const int rank_rev = m_map_rank[ omp_get_thread_num() ];
+    m_pool.at(rank_rev).clear();
+  }
+/* END #pragma omp parallel */
+}
+
+void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
+{
+  enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
+  enum { ALLOC_EXEC = ( sizeof(OpenMPexec) + ALIGN_MASK ) & ~ALIGN_MASK };
+
+  const size_t old_reduce_size = m_pool[0] ? m_pool[0]->m_scratch_reduce_end : 0 ;
+  const size_t old_thread_size = m_pool[0] ? m_pool[0]->m_scratch_thread_end - m_pool[0]->m_scratch_reduce_end : 0 ;
+
+  reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
+  thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
+
+  // Requesting allocation and old allocation is too small:
+
+  const bool allocate = ( old_reduce_size < reduce_size ) ||
+                        ( old_thread_size < thread_size );
+
+  if ( allocate ) {
+    if ( reduce_size < old_reduce_size ) { reduce_size = old_reduce_size ; }
+    if ( thread_size < old_thread_size ) { thread_size = old_thread_size ; }
+  }
+
+  const size_t alloc_size = allocate ? ALLOC_EXEC + reduce_size + thread_size : 0 ;
+  const int    pool_size  = m_pool_topo[0] ;
+
+  if ( allocate ) {
+
+    clear_scratch();
+
+#pragma omp parallel
+    {
+      const int rank_rev = m_map_rank[ omp_get_thread_num() ];
+      const int rank     = pool_size - ( rank_rev + 1 );
+
+      m_pool.at(rank_rev) = HostSpace::allocate_and_track( "openmp_scratch", alloc_size );
+      new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
+    }
+/* END #pragma omp parallel */
+  }
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+int OpenMP::is_initialized()
+{ return 0 != Impl::OpenMPexec::m_pool[0]; }
+
+void OpenMP::initialize( unsigned thread_count ,
+                         unsigned use_numa_count ,
+                         unsigned use_cores_per_numa )
+{
+  // Before any other call to OMP query the maximum number of threads
+  // and save the value for re-initialization unit testing.
+
+  //Using omp_get_max_threads(); is problematic in conjunction with
+  //Hwloc on Intel (essentially an initial call to the OpenMP runtime
+  //without a parallel region before will set a process mask for a single core
+  //The runtime will than bind threads for a parallel region to other cores on the
+  //entering the first parallel region and make the process mask the aggregate of
+  //the thread masks. The intend seems to be to make serial code run fast, if you
+  //compile with OpenMP enabled but don't actually use parallel regions or so
+  //static int omp_max_threads = omp_get_max_threads();
+  int nthreads = 0;
+  #pragma omp parallel
+  {
+    #pragma omp atomic
+    nthreads++;
+  }
+
+  static int omp_max_threads = nthreads;
+
+  const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
+
+  bool thread_spawn_failed = false ;
+
+  if ( ! is_initialized ) {
+
+    // Use hwloc thread pinning if concerned with locality.
+    // If spreading threads across multiple NUMA regions.
+    // If hyperthreading is enabled.
+    Impl::s_using_hwloc = hwloc::available() && (
+                            ( 1 < Kokkos::hwloc::get_available_numa_count() ) ||
+                            ( 1 < Kokkos::hwloc::get_available_threads_per_core() ) );
+
+    std::pair<unsigned,unsigned> threads_coord[ Impl::OpenMPexec::MAX_THREAD_COUNT ];
+
+    // If hwloc available then use it's maximum value.
+
+    if ( thread_count == 0 ) {
+      thread_count = Impl::s_using_hwloc
+      ? Kokkos::hwloc::get_available_numa_count() *
+        Kokkos::hwloc::get_available_cores_per_numa() *
+        Kokkos::hwloc::get_available_threads_per_core()
+      : omp_max_threads ;
+    }
+
+    if(Impl::s_using_hwloc)
+      hwloc::thread_mapping( "Kokkos::OpenMP::initialize" ,
+                           false /* do not allow asynchronous */ ,
+                           thread_count ,
+                           use_numa_count ,
+                           use_cores_per_numa ,
+                           threads_coord );
+
+    // Spawn threads:
+
+    omp_set_num_threads( thread_count );
+
+    // Verify OMP interaction:
+    if ( int(thread_count) != omp_get_max_threads() ) {
+      thread_spawn_failed = true ;
+    }
+
+    // Verify spawning and bind threads:
+#pragma omp parallel
+    {
+#pragma omp critical
+      {
+        if ( int(thread_count) != omp_get_num_threads() ) {
+          thread_spawn_failed = true ;
+        }
+
+        // Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region.
+        // Call to 'new' may not be thread safe as well.
+
+        // Reverse the rank for threads so that the scan operation reduces to the highest rank thread.
+
+        const unsigned omp_rank    = omp_get_thread_num();
+        const unsigned thread_r    = Impl::s_using_hwloc ? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord ) : omp_rank ;
+
+        Impl::OpenMPexec::m_map_rank[ omp_rank ] = thread_r ;
+      }
+/* END #pragma omp critical */
+    }
+/* END #pragma omp parallel */
+
+    if ( ! thread_spawn_failed ) {
+      Impl::OpenMPexec::m_pool_topo[0] = thread_count ;
+      Impl::OpenMPexec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
+      Impl::OpenMPexec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
+
+      Impl::OpenMPexec::resize_scratch( 1024 , 1024 );
+    }
+  }
+
+  if ( is_initialized || thread_spawn_failed ) {
+    std::string msg("Kokkos::OpenMP::initialize ERROR");
+
+    if ( is_initialized ) { msg.append(" : already initialized"); }
+    if ( thread_spawn_failed ) { msg.append(" : failed spawning threads"); }
+
+    Kokkos::Impl::throw_runtime_exception(msg);
+  }
+
+  // Init the array for used for arbitrarily sized atomics
+  Impl::init_lock_array_host_space();
+}
+
+//----------------------------------------------------------------------------
+
+void OpenMP::finalize()
+{
+  Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" );
+  Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" );
+
+  Impl::OpenMPexec::clear_scratch();
+
+  Impl::OpenMPexec::m_pool_topo[0] = 0 ;
+  Impl::OpenMPexec::m_pool_topo[1] = 0 ;
+  Impl::OpenMPexec::m_pool_topo[2] = 0 ;
+
+  omp_set_num_threads(1);
+
+  if ( Impl::s_using_hwloc ) {
+    hwloc::unbind_this_thread();
+  }
+}
+
+//----------------------------------------------------------------------------
+
+void OpenMP::print_configuration( std::ostream & s , const bool detail )
+{
+  Impl::OpenMPexec::verify_is_process( "OpenMP::print_configuration" );
+
+  s << "Kokkos::OpenMP" ;
+
+#if defined( KOKKOS_HAVE_OPENMP )
+  s << " KOKKOS_HAVE_OPENMP" ;
+#endif
+#if defined( KOKKOS_HAVE_HWLOC )
+
+  const unsigned numa_count_       = Kokkos::hwloc::get_available_numa_count();
+  const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+  const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+  s << " hwloc[" << numa_count_ << "x" << cores_per_numa << "x" << threads_per_core << "]"
+    << " hwloc_binding_" << ( Impl::s_using_hwloc ? "enabled" : "disabled" )
+    ;
+#endif
+
+  const bool is_initialized = 0 != Impl::OpenMPexec::m_pool[0] ;
+
+  if ( is_initialized ) {
+    const int numa_count      = Kokkos::Impl::OpenMPexec::m_pool_topo[0] / Kokkos::Impl::OpenMPexec::m_pool_topo[1] ;
+    const int core_per_numa   = Kokkos::Impl::OpenMPexec::m_pool_topo[1] / Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
+    const int thread_per_core = Kokkos::Impl::OpenMPexec::m_pool_topo[2] ;
+
+    s << " thread_pool_topology[ " << numa_count
+      << " x " << core_per_numa
+      << " x " << thread_per_core
+      << " ]"
+      << std::endl ;
+
+    if ( detail ) {
+      std::vector< std::pair<unsigned,unsigned> > coord( Kokkos::Impl::OpenMPexec::m_pool_topo[0] );
+
+#pragma omp parallel
+      {
+#pragma omp critical
+        {
+          coord[ omp_get_thread_num() ] = hwloc::get_this_thread_coordinate();
+        }
+/* END #pragma omp critical */
+      }
+/* END #pragma omp parallel */
+
+      for ( unsigned i = 0 ; i < coord.size() ; ++i ) {
+        s << "  thread omp_rank[" << i << "]"
+          << " kokkos_rank[" << Impl::OpenMPexec::m_map_rank[ i ] << "]"
+          << " hwloc_coord[" << coord[i].first << "." << coord[i].second << "]"
+          << std::endl ;
+      }
+    }
+  }
+  else {
+    s << " not initialized" << std::endl ;
+  }
+}
+
+} // namespace Kokkos
+
+#endif //KOKKOS_HAVE_OPENMP
diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..1ab08f648d42a01f81dfdc3d890d5d06fa974f29
--- /dev/null
+++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMPexec.hpp
@@ -0,0 +1,767 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_OPENMPEXEC_HPP
+#define KOKKOS_OPENMPEXEC_HPP
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_spinwait.hpp>
+#include <impl/Kokkos_AllocationTracker.hpp>
+
+#include <Kokkos_Atomic.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+/** \brief  Data for OpenMP thread execution */
+
+class OpenMPexec {
+public:
+
+  enum { MAX_THREAD_COUNT = 4096 };
+
+  struct Pool
+  {
+    Pool() : m_trackers() {}
+
+    AllocationTracker m_trackers[ MAX_THREAD_COUNT ];
+
+    OpenMPexec * operator[](int i)
+    {
+      return reinterpret_cast<OpenMPexec *>(m_trackers[i].alloc_ptr());
+    }
+
+    AllocationTracker & at(int i)
+    {
+      return m_trackers[i];
+    }
+  };
+
+private:
+
+  static int          m_pool_topo[ 4 ];
+  static int          m_map_rank[ MAX_THREAD_COUNT ];
+  static Pool         m_pool; // Indexed by: m_pool_rank_rev
+
+  friend class Kokkos::OpenMP ;
+
+  int const  m_pool_rank ;
+  int const  m_pool_rank_rev ;
+  int const  m_scratch_exec_end ;
+  int const  m_scratch_reduce_end ;
+  int const  m_scratch_thread_end ;
+
+  int volatile  m_barrier_state ;
+
+  OpenMPexec();
+  OpenMPexec( const OpenMPexec & );
+  OpenMPexec & operator = ( const OpenMPexec & );
+
+  static void clear_scratch();
+
+public:
+
+  // Topology of a cache coherent thread pool:
+  //   TOTAL = NUMA x GRAIN
+  //   pool_size( depth = 0 )
+  //   pool_size(0) = total number of threads
+  //   pool_size(1) = number of threads per NUMA
+  //   pool_size(2) = number of threads sharing finest grain memory hierarchy
+
+  inline static
+  int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; }
+
+  inline static
+  OpenMPexec * pool_rev( int pool_rank_rev ) { return m_pool[ pool_rank_rev ]; }
+
+  inline int pool_rank() const { return m_pool_rank ; }
+  inline int pool_rank_rev() const { return m_pool_rank_rev ; }
+
+  inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; }
+  inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; }
+
+  inline
+  void state_wait( int state )
+    { Impl::spinwait( m_barrier_state , state ); }
+
+  inline
+  void state_set( int state ) { m_barrier_state = state ; }
+
+  ~OpenMPexec() {}
+
+  OpenMPexec( const int poolRank
+            , const int scratch_exec_size
+            , const int scratch_reduce_size
+            , const int scratch_thread_size )
+    : m_pool_rank( poolRank )
+    , m_pool_rank_rev( pool_size() - ( poolRank + 1 ) )
+    , m_scratch_exec_end( scratch_exec_size )
+    , m_scratch_reduce_end( m_scratch_exec_end   + scratch_reduce_size )
+    , m_scratch_thread_end( m_scratch_reduce_end + scratch_thread_size )
+    , m_barrier_state(0)
+    {}
+
+  static void finalize();
+
+  static void initialize( const unsigned  team_count ,
+                          const unsigned threads_per_team ,
+                          const unsigned numa_count ,
+                          const unsigned cores_per_numa );
+
+  static void verify_is_process( const char * const );
+  static void verify_initialized( const char * const );
+
+  static void resize_scratch( size_t reduce_size , size_t thread_size );
+
+  inline static
+  OpenMPexec * get_thread_omp() { return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+class OpenMPexecTeamMember {
+private:
+
+  enum { TEAM_REDUCE_SIZE = 512 };
+
+  /** \brief  Thread states for team synchronization */
+  enum { Active = 0 , Rendezvous = 1 };
+
+  typedef Kokkos::OpenMP                         execution_space ;
+  typedef execution_space::scratch_memory_space  scratch_memory_space ;
+
+  Impl::OpenMPexec    & m_exec ;
+  scratch_memory_space  m_team_shared ;
+  int                   m_team_shmem ;
+  int                   m_team_base_rev ;
+  int                   m_team_rank_rev ;
+  int                   m_team_rank ;
+  int                   m_team_size ;
+  int                   m_league_rank ;
+  int                   m_league_end ;
+  int                   m_league_size ;
+
+  // Fan-in team threads, root of the fan-in which does not block returns true
+  inline
+  bool team_fan_in() const
+    {
+      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
+        m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
+      }
+
+      if ( m_team_rank_rev ) {
+        m_exec.state_set( Rendezvous );
+        m_exec.state_wait( Rendezvous );
+      }
+
+      return 0 == m_team_rank_rev ;
+    }
+
+  inline
+  void team_fan_out() const
+    {
+      for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
+        m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
+      }
+    }
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space & team_shmem() const
+    { return m_team_shared ; }
+
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
+
+  KOKKOS_INLINE_FUNCTION void team_barrier() const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {}
+#else
+    {
+      if ( 1 < m_team_size ) {
+        team_fan_in();
+        team_fan_out();
+      }
+    }
+#endif
+
+  template<class ValueType>
+  KOKKOS_INLINE_FUNCTION
+  void team_broadcast(ValueType& value, const int& thread_id) const
+  {
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { }
+#else
+    // Make sure there is enough scratch space:
+    typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
+                         , ValueType , void >::type type ;
+
+    type * const local_value = ((type*) m_exec.scratch_thread());
+    if(team_rank() == thread_id)
+      *local_value = value;
+    memory_fence();
+    team_barrier();
+    value = *local_value;
+#endif
+  }
+
+#ifdef KOKKOS_HAVE_CXX11
+  template< class ValueType, class JoinOp >
+  KOKKOS_INLINE_FUNCTION ValueType
+    team_reduce( const ValueType & value
+               , const JoinOp & op_in ) const
+  #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return ValueType(); }
+  #else
+    {
+      typedef ValueType value_type;
+      const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
+  #endif
+#else // KOKKOS_HAVE_CXX11
+  template< class JoinOp >
+  KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
+    team_reduce( const typename JoinOp::value_type & value
+               , const JoinOp & op ) const
+  #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return typename JoinOp::value_type(); }
+  #else
+    {
+      typedef typename JoinOp::value_type value_type;
+  #endif
+#endif // KOKKOS_HAVE_CXX11
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
+                           , value_type , void >::type type ;
+
+      type * const local_value = ((type*) m_exec.scratch_thread());
+
+      // Set this thread's contribution
+      *local_value = value ;
+
+      // Fence to make sure the base team member has access:
+      memory_fence();
+
+      if ( team_fan_in() ) {
+        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
+        type * const team_value  = ((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread());
+
+        // Join to the team value:
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          op.join( *team_value , *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) );
+        }
+
+        // The base team member may "lap" the other team members,
+        // copy to their local value before proceeding.
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) = *team_value ;
+        }
+
+        // Fence to make sure all team members have access
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      return *((type volatile const *)local_value);
+    }
+#endif
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   */
+  template< typename ArgType >
+  KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return ArgType(); }
+#else
+    {
+      // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
+
+      volatile type * const work_value  = ((type*) m_exec.scratch_thread());
+
+      *work_value = value ;
+
+      memory_fence();
+
+      if ( team_fan_in() ) {
+        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
+        // m_team_base[0]                 == highest ranking team member
+        // m_team_base[ m_team_size - 1 ] == lowest ranking team member
+        //
+        // 1) copy from lower to higher rank, initialize lowest rank to zero
+        // 2) prefix sum from lowest to highest rank, skipping lowest rank
+
+        type accum = 0 ;
+
+        if ( global_accum ) {
+          for ( int i = m_team_size ; i-- ; ) {
+            type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
+            accum += val ;
+          }
+          accum = atomic_fetch_add( global_accum , accum );
+        }
+
+        for ( int i = m_team_size ; i-- ; ) {
+          type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
+          const type offset = accum ;
+          accum += val ;
+          val = offset ;
+        }
+
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      return *work_value ;
+    }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
+    { return this-> template team_scan<Type>( value , 0 ); }
+
+  //----------------------------------------
+  // Private for the driver
+
+private:
+
+  typedef execution_space::scratch_memory_space space ;
+
+public:
+
+  template< class Arg0 , class Arg1 >
+  inline
+  OpenMPexecTeamMember( Impl::OpenMPexec & exec
+                      , const TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP > & team
+                      , const int shmem_size
+                      )
+    : m_exec( exec )
+    , m_team_shared(0,0)
+    , m_team_shmem( shmem_size )
+    , m_team_base_rev(0)
+    , m_team_rank_rev(0)
+    , m_team_rank(0)
+    , m_team_size( team.team_size() )
+    , m_league_rank(0)
+    , m_league_end(0)
+    , m_league_size( team.league_size() )
+    {
+      const int pool_rank_rev        = m_exec.pool_rank_rev();
+      const int pool_team_rank_rev   = pool_rank_rev % team.team_alloc();
+      const int pool_league_rank_rev = pool_rank_rev / team.team_alloc();
+      const int league_iter_end      = team.league_size() - pool_league_rank_rev * team.team_iter();
+
+      if ( pool_team_rank_rev < m_team_size && 0 < league_iter_end ) {
+        m_team_base_rev  = team.team_alloc() * pool_league_rank_rev ;
+        m_team_rank_rev  = pool_team_rank_rev ;
+        m_team_rank      = m_team_size - ( m_team_rank_rev + 1 );
+        m_league_end     = league_iter_end ;
+        m_league_rank    = league_iter_end > team.team_iter() ? league_iter_end - team.team_iter() : 0 ;
+        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
+      }
+    }
+
+  bool valid() const
+    { return m_league_rank < m_league_end ; }
+
+  void next()
+    {
+      if ( ++m_league_rank < m_league_end ) {
+        team_barrier();
+        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
+      }
+    }
+
+  static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
+};
+
+
+
+} // namespace Impl
+
+template< class Arg0 , class Arg1 >
+class TeamPolicy< Arg0 , Arg1 , Kokkos::OpenMP >
+{
+public:
+
+  //! Tag this class as a kokkos execution policy
+  typedef TeamPolicy      execution_policy ;
+
+  //! Execution space of this execution policy.
+  typedef Kokkos::OpenMP  execution_space ;
+
+  typedef typename
+    Impl::if_c< ! Impl::is_same< Kokkos::OpenMP , Arg0 >::value , Arg0 , Arg1 >::type
+      work_tag ;
+
+  //----------------------------------------
+
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & )
+    { return execution_space::thread_pool_size(1); }
+
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType & )
+    { return execution_space::thread_pool_size(2); }
+
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType &, const int& )
+    { return execution_space::thread_pool_size(2); }
+
+  //----------------------------------------
+
+private:
+
+  int m_league_size ;
+  int m_team_size ;
+  int m_team_alloc ;
+  int m_team_iter ;
+
+  inline void init( const int league_size_request
+                  , const int team_size_request )
+    {
+      const int pool_size  = execution_space::thread_pool_size(0);
+      const int team_max   = execution_space::thread_pool_size(1);
+      const int team_grain = execution_space::thread_pool_size(2);
+
+      m_league_size = league_size_request ;
+
+      m_team_size = team_size_request < team_max ?
+                    team_size_request : team_max ;
+
+      // Round team size up to a multiple of 'team_gain'
+      const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
+      const int team_count      = pool_size / team_size_grain ;
+
+      // Constraint : pool_size = m_team_alloc * team_count
+      m_team_alloc = pool_size / team_count ;
+
+      // Maxumum number of iterations each team will take:
+      m_team_iter  = ( m_league_size + team_count - 1 ) / team_count ;
+    }
+
+public:
+
+  inline int team_size()   const { return m_team_size ; }
+  inline int league_size() const { return m_league_size ; }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicy( execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1)
+    { init( league_size_request , team_size_request ); (void) vector_length_request; }
+
+  TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 )
+    { init( league_size_request , team_size_request ); (void) vector_length_request; }
+
+  inline int team_alloc() const { return m_team_alloc ; }
+  inline int team_iter()  const { return m_team_iter ; }
+
+  typedef Impl::OpenMPexecTeamMember member_type ;
+};
+
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+inline
+int OpenMP::thread_pool_size( int depth )
+{
+  return Impl::OpenMPexec::pool_size(depth);
+}
+
+KOKKOS_INLINE_FUNCTION
+int OpenMP::thread_pool_rank()
+{
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  return Impl::OpenMPexec::m_map_rank[ omp_get_thread_num() ];
+#else
+  return -1 ;
+#endif
+}
+
+} // namespace Kokkos
+
+
+namespace Kokkos {
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>
+  TeamThreadRange(const Impl::OpenMPexecTeamMember& thread, const iType& count) {
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>(thread,count);
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>
+  TeamThreadRange(const Impl::OpenMPexecTeamMember& thread, const iType& begin, const iType& end) {
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>(thread,begin,end);
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >
+  ThreadVectorRange(const Impl::OpenMPexecTeamMember& thread, const iType& count) {
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >(thread,count);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember> PerTeam(const Impl::OpenMPexecTeamMember& thread) {
+  return Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>(thread);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember> PerThread(const Impl::OpenMPexecTeamMember& thread) {
+  return Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>(thread);
+}
+} // namespace Kokkos
+
+namespace Kokkos {
+
+  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+   *
+   * The range i=0..N-1 is mapped to all threads of the the calling thread team.
+   * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries, const Lambda& lambda) {
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+}
+
+/** \brief  Inter-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the the calling thread team and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
+                     const Lambda & lambda, ValueType& result) {
+
+  result = ValueType();
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    result+=tmp;
+  }
+
+  result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
+                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType result = init_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    join(result,tmp);
+  }
+
+  init_result = loop_boundaries.thread.team_reduce(result,join);
+}
+
+} //namespace Kokkos
+
+
+namespace Kokkos {
+/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
+ * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
+    loop_boundaries, const Lambda& lambda) {
+  #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+  #pragma ivdep
+  #endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
+      loop_boundaries, const Lambda & lambda, ValueType& result) {
+  result = ValueType();
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    result+=tmp;
+  }
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
+      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType result = init_result;
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    join(result,tmp);
+  }
+  init_result = result;
+}
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
+ *          for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
+ * Depending on the target execution space the operator might be called twice: once with final=false
+ * and once with final=true. When final==true val contains the prefix sum value. The contribution of this
+ * "i" needs to be added to val no matter whether final==true or not. In a serial execution
+ * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
+ * to the final sum value over all vector lanes.
+ * This functionality requires C++11 support.*/
+template< typename iType, class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
+      loop_boundaries, const FunctorType & lambda) {
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
+  typedef typename ValueTraits::value_type value_type ;
+
+  value_type scan_val = value_type();
+
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,scan_val,true);
+  }
+}
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
+  lambda();
+}
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
+  if(single_struct.team_member.team_rank()==0) lambda();
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  lambda(val);
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  if(single_struct.team_member.team_rank()==0) {
+    lambda(val);
+  }
+  single_struct.team_member.team_broadcast(val,0);
+}
+}
+
+#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */
+
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..d8b40943deb6264f96a787bdd661534ca1372c8d
--- /dev/null
+++ b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.cpp
@@ -0,0 +1,484 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_HAVE_QTHREAD )
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <iostream>
+#include <sstream>
+#include <utility>
+#include <Kokkos_Qthread.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+// Defines to enable experimental Qthread functionality
+
+#define QTHREAD_LOCAL_PRIORITY
+#define CLONED_TASKS
+
+#include <qthread/qthread.h>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+enum { MAXIMUM_QTHREAD_WORKERS = 1024 };
+
+/** s_exec is indexed by the reverse rank of the workers
+ *  for faster fan-in / fan-out lookups
+ *  [ n - 1 , n - 2 , ... , 0 ]
+ */
+QthreadExec * s_exec[ MAXIMUM_QTHREAD_WORKERS ];
+
+int  s_number_shepherds            = 0 ;
+int  s_number_workers_per_shepherd = 0 ;
+int  s_number_workers              = 0 ;
+
+inline
+QthreadExec ** worker_exec()
+{
+  return s_exec + s_number_workers - ( qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local(NULL) + 1 );
+}
+
+const int s_base_size = QthreadExec::align_alloc( sizeof(QthreadExec) );
+
+int s_worker_reduce_end   = 0 ; /* End of worker reduction memory    */
+int s_worker_shared_end   = 0 ; /* Total of worker scratch memory    */
+int s_worker_shared_begin = 0 ; /* Beginning of worker shared memory */
+
+QthreadExecFunctionPointer volatile s_active_function = 0 ;
+const void               * volatile s_active_function_arg = 0 ;
+
+} /* namespace */
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+void Qthread::initialize( int thread_count )
+{
+  // Environment variable: QTHREAD_NUM_SHEPHERDS
+  // Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP
+  // Environment variable: QTHREAD_HWPAR
+
+  {
+    char buffer[256];
+    snprintf(buffer,sizeof(buffer),"QTHREAD_HWPAR=%d",thread_count);
+    putenv(buffer);
+  }
+
+  const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
+                       ( thread_count    == qthread_num_shepherds() * qthread_num_workers_local(NO_SHEPHERD) ) &&
+                       ( thread_count    == qthread_num_workers() );
+
+  bool ok_symmetry = true ;
+
+  if ( ok_init ) {
+    Impl::s_number_shepherds            = qthread_num_shepherds();
+    Impl::s_number_workers_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
+    Impl::s_number_workers              = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd ;
+
+    for ( int i = 0 ; ok_symmetry && i < Impl::s_number_shepherds ; ++i ) {
+      ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local(i) );
+    }
+  }
+
+  if ( ! ok_init || ! ok_symmetry ) {
+    std::ostringstream msg ;
+
+    msg << "Kokkos::Qthread::initialize(" << thread_count << ") FAILED" ;
+    msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
+    msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local(NO_SHEPHERD);
+    msg << " : qthread_num_workers = " << qthread_num_workers();
+
+    if ( ! ok_symmetry ) {
+      msg << " : qthread_num_workers_local = {" ;
+      for ( int i = 0 ; i < Impl::s_number_shepherds ; ++i ) {
+        msg << " " << qthread_num_workers_local(i) ;
+      }
+      msg << " }" ;
+    }
+
+    Impl::s_number_workers   = 0 ;
+    Impl::s_number_shepherds = 0 ;
+    Impl::s_number_workers_per_shepherd = 0 ;
+
+    if ( ok_init ) { qthread_finalize(); }
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  Impl::QthreadExec::resize_worker_scratch( 256 , 256 );
+
+  // Init the array for used for arbitrarily sized atomics
+  Impl::init_lock_array_host_space();
+
+}
+
+void Qthread::finalize()
+{
+  Impl::QthreadExec::clear_workers();
+
+  if ( Impl::s_number_workers ) {
+    qthread_finalize();
+  }
+
+  Impl::s_number_workers    = 0 ;
+  Impl::s_number_shepherds  = 0 ;
+  Impl::s_number_workers_per_shepherd = 0 ;
+}
+
+void Qthread::print_configuration( std::ostream & s , const bool detail )
+{
+  s << "Kokkos::Qthread {"
+    << " num_shepherds(" << Impl::s_number_shepherds << ")"
+    << " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")"
+    << " }" << std::endl ;
+}
+
+Qthread & Qthread::instance( int )
+{
+  static Qthread q ;
+  return q ;
+}
+
+void Qthread::fence()
+{
+}
+
+int Qthread::shepherd_size() const { return Impl::s_number_shepherds ; }
+int Qthread::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd ; }
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+aligned_t driver_exec_all( void * arg )
+{
+  QthreadExec & exec = **worker_exec();
+
+  (*s_active_function)( exec , s_active_function_arg );
+
+/*
+  fprintf( stdout
+         , "QthreadExec driver worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
+         , exec.worker_rank()
+         , exec.worker_size()
+         , exec.shepherd_rank()
+         , exec.shepherd_size()
+         , exec.shepherd_worker_rank()
+         , exec.shepherd_worker_size()
+         );
+  fflush(stdout);
+*/
+
+  return 0 ;
+}
+
+aligned_t driver_resize_worker_scratch( void * arg )
+{
+  static volatile int lock_begin = 0 ;
+  static volatile int lock_end   = 0 ;
+
+  QthreadExec ** const exec = worker_exec();
+
+  //----------------------------------------
+  // Serialize allocation for thread safety
+
+  while ( ! atomic_compare_exchange_strong( & lock_begin , 0 , 1 ) ); // Spin wait to claim lock
+
+  const bool ok = 0 == *exec ;
+
+  if ( ok ) { *exec = (QthreadExec *) malloc( s_base_size + s_worker_shared_end ); }
+
+  lock_begin = 0 ; // release lock
+
+  if ( ok ) { new( *exec ) QthreadExec(); }
+
+  //----------------------------------------
+  // Wait for all calls to complete to insure that each worker has executed.
+
+  if ( s_number_workers == 1 + atomic_fetch_add( & lock_end , 1 ) ) { lock_end = 0 ; }
+
+  while ( lock_end );
+
+/*
+  fprintf( stdout
+         , "QthreadExec resize worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
+         , (**exec).worker_rank()
+         , (**exec).worker_size()
+         , (**exec).shepherd_rank()
+         , (**exec).shepherd_size()
+         , (**exec).shepherd_worker_rank()
+         , (**exec).shepherd_worker_size()
+         );
+  fflush(stdout);
+*/
+
+  //----------------------------------------
+
+  if ( ! ok ) {
+    fprintf( stderr , "Kokkos::QthreadExec resize failed\n" );
+    fflush( stderr );
+  }
+
+  return 0 ;
+}
+
+void verify_is_process( const char * const label , bool not_active = false )
+{
+  const bool not_process = 0 != qthread_shep() || 0 != qthread_worker_local(NULL);
+  const bool is_active   = not_active && ( s_active_function || s_active_function_arg );
+
+  if ( not_process || is_active ) {
+    std::string msg( label );
+    msg.append( " : FAILED" );
+    if ( not_process ) msg.append(" : not called by main process");
+    if ( is_active )   msg.append(" : parallel execution in progress");
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+}
+
+}
+
+int QthreadExec::worker_per_shepherd()
+{
+  return s_number_workers_per_shepherd ;
+}
+
+QthreadExec::QthreadExec()
+{
+  const int shepherd_rank        = qthread_shep();
+  const int shepherd_worker_rank = qthread_worker_local(NULL);
+  const int worker_rank          = shepherd_rank * s_number_workers_per_shepherd + shepherd_worker_rank ;
+
+  m_worker_base          = s_exec ;
+  m_shepherd_base        = s_exec + s_number_workers_per_shepherd * ( ( s_number_shepherds - ( shepherd_rank + 1 ) ) );
+  m_scratch_alloc        = ( (unsigned char *) this ) + s_base_size ;
+  m_reduce_end           = s_worker_reduce_end ;
+  m_shepherd_rank        = shepherd_rank ;
+  m_shepherd_size        = s_number_shepherds ;
+  m_shepherd_worker_rank = shepherd_worker_rank ;
+  m_shepherd_worker_size = s_number_workers_per_shepherd ;
+  m_worker_rank          = worker_rank ;
+  m_worker_size          = s_number_workers ;
+  m_worker_state         = QthreadExec::Active ;
+}
+
+void QthreadExec::clear_workers()
+{
+  for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
+    QthreadExec * const exec = s_exec[iwork] ;
+    s_exec[iwork] = 0 ;
+    free( exec );
+  }
+}
+
+void QthreadExec::shared_reset( Qthread::scratch_memory_space & space )
+{
+  new( & space )
+    Qthread::scratch_memory_space(
+      ((unsigned char *) (**m_shepherd_base).m_scratch_alloc ) + s_worker_shared_begin ,
+      s_worker_shared_end - s_worker_shared_begin
+    );
+}
+
+void QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size )
+{
+  const int exec_all_reduce_alloc = align_alloc( reduce_size );
+  const int shepherd_scan_alloc   = align_alloc( 8 );
+  const int shepherd_shared_end   = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );
+
+  if ( s_worker_reduce_end < exec_all_reduce_alloc ||
+       s_worker_shared_end < shepherd_shared_end ) {
+
+/*
+  fprintf( stdout , "QthreadExec::resize\n");
+  fflush(stdout);
+*/
+
+    // Clear current worker memory before allocating new worker memory
+    clear_workers();
+
+    // Increase the buffers to an aligned allocation
+    s_worker_reduce_end   = exec_all_reduce_alloc ;
+    s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc ;
+    s_worker_shared_end   = shepherd_shared_end ;
+
+    // Need to query which shepherd this main 'process' is running...
+ 
+    const int main_shep = qthread_shep();
+
+    // Have each worker resize its memory for proper first-touch
+#if 0
+    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
+    for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) {
+      qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep );
+    }}
+#else
+    // If this function is used before the 'qthread.task_policy' unit test
+    // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so.
+    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
+      const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;
+
+      if ( num_clone ) {
+        const int ret = qthread_fork_clones_to_local_priority
+          ( driver_resize_worker_scratch   /* function */
+          , NULL                           /* function data block */
+          , NULL                           /* pointer to return value feb */
+          , jshep                          /* shepherd number */
+          , num_clone - 1                  /* number of instances - 1 */
+          );
+
+        assert(ret == QTHREAD_SUCCESS);
+      }
+    }
+#endif
+
+    driver_resize_worker_scratch( NULL );
+
+    // Verify all workers allocated
+
+    bool ok = true ;
+    for ( int iwork = 0 ; ok && iwork < s_number_workers ; ++iwork ) { ok = 0 != s_exec[iwork] ; }
+
+    if ( ! ok ) {
+      std::ostringstream msg ;
+      msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ;
+      for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
+         if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
+      }
+      msg << " }" ;
+      Kokkos::Impl::throw_runtime_exception( msg.str() );
+    }
+  }
+}
+
+void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg )
+{
+  verify_is_process("QthreadExec::exec_all(...)",true);
+
+/*
+  fprintf( stdout , "QthreadExec::exec_all\n");
+  fflush(stdout);
+*/
+
+  s_active_function     = func ;
+  s_active_function_arg = arg ;
+
+  // Need to query which shepherd this main 'process' is running...
+ 
+  const int main_shep = qthread_shep();
+
+#if 0
+  for ( int jshep = 0 , iwork = 0 ; jshep < s_number_shepherds ; ++jshep ) {
+  for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i , ++iwork ) {
+    qthread_fork_to( driver_exec_all , NULL , NULL , jshep );
+  }}
+#else
+  // If this function is used before the 'qthread.task_policy' unit test
+  // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so.
+  for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
+    const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;
+
+    if ( num_clone ) {
+      const int ret = qthread_fork_clones_to_local_priority
+        ( driver_exec_all   /* function */
+        , NULL              /* function data block */
+        , NULL              /* pointer to return value feb */
+        , jshep             /* shepherd number */
+        , num_clone - 1     /* number of instances - 1 */
+        );
+
+      assert(ret == QTHREAD_SUCCESS);
+    }
+  }
+#endif
+
+  driver_exec_all( NULL );
+
+  s_active_function     = 0 ;
+  s_active_function_arg = 0 ;
+}
+
+void * QthreadExec::exec_all_reduce_result()
+{
+  return s_exec[0]->m_scratch_alloc ;
+}
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+namespace Kokkos {
+namespace Impl {
+
+QthreadTeamPolicyMember::QthreadTeamPolicyMember()
+  : m_exec( **worker_exec() )
+  , m_team_shared(0,0)
+  , m_team_size( 1 ) // s_number_workers_per_shepherd )
+  , m_team_rank( 0 ) // m_exec.shepherd_worker_rank() )
+  , m_league_size(1)
+  , m_league_end(1)
+  , m_league_rank(0)
+{
+  m_exec.shared_reset( m_team_shared );
+}
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */
+
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..365883685772e89d8d32f9dfbfe79d34c746a9aa
--- /dev/null
+++ b/lib/kokkos/core/src/Qthread/Kokkos_QthreadExec.hpp
@@ -0,0 +1,614 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QTHREADEXEC_HPP
+#define KOKKOS_QTHREADEXEC_HPP
+
+#include <impl/Kokkos_spinwait.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+class QthreadExec ;
+
+typedef void (*QthreadExecFunctionPointer)( QthreadExec & , const void * );
+
+class QthreadExec {
+private:
+
+  enum { Inactive = 0 , Active = 1 };
+
+  const QthreadExec * const * m_worker_base ;
+  const QthreadExec * const * m_shepherd_base ;
+
+  void  * m_scratch_alloc ;  ///< Scratch memory [ reduce , team , shared ]
+  int     m_reduce_end ;     ///< End of scratch reduction memory
+
+  int     m_shepherd_rank ;
+  int     m_shepherd_size ;
+
+  int     m_shepherd_worker_rank ;
+  int     m_shepherd_worker_size ;
+
+  /*
+   *  m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank
+   *  m_worker_size = m_shepherd_size * m_shepherd_worker_size
+   */
+  int     m_worker_rank ;
+  int     m_worker_size ;
+
+  int mutable volatile m_worker_state ;
+
+
+  friend class Kokkos::Qthread ;
+
+  ~QthreadExec();
+  QthreadExec( const QthreadExec & );
+  QthreadExec & operator = ( const QthreadExec & );
+
+public:
+
+  QthreadExec();
+
+  /** Execute the input function on all available Qthread workers */
+  static void exec_all( Qthread & , QthreadExecFunctionPointer , const void * );
+
+  //----------------------------------------
+  /** Barrier across all workers participating in the 'exec_all' */
+  void exec_all_barrier() const
+    {
+      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+
+      int n , j ;
+
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
+      }
+
+      if ( rev_rank ) {
+        m_worker_state = QthreadExec::Inactive ;
+        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+      }
+    
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
+      }
+    }
+
+  /** Barrier across workers within the shepherd with rank < team_rank */
+  void shepherd_barrier( const int team_size ) const
+    {
+      if ( m_shepherd_worker_rank < team_size ) {
+
+        const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+        int n , j ;
+
+        for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+          Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
+        }
+
+        if ( rev_rank ) {
+          m_worker_state = QthreadExec::Inactive ;
+          Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+        }
+    
+        for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+          m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
+        }
+      }
+    }
+
+  //----------------------------------------
+  /** Reduce across all workers participating in the 'exec_all' */
+  template< class FunctorType , class ArgTag >
+  inline
+  void exec_all_reduce( const FunctorType & func ) const
+    {
+      typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
+
+      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+
+      int n , j ;
+
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        const QthreadExec & fan = *m_worker_base[j];
+
+        Impl::spinwait( fan.m_worker_state , QthreadExec::Active );
+
+        ValueJoin::join( func , m_scratch_alloc , fan.m_scratch_alloc );
+      }
+
+      if ( rev_rank ) {
+        m_worker_state = QthreadExec::Inactive ;
+        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+      }
+    
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
+      }
+    }
+
+  //----------------------------------------
+  /** Scall across all workers participating in the 'exec_all' */
+  template< class FunctorType , class ArgTag >
+  inline
+  void exec_all_scan( const FunctorType & func ) const
+    {
+      typedef Kokkos::Impl::FunctorValueInit<   FunctorType , ArgTag > ValueInit ;
+      typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , ArgTag > ValueJoin ;
+      typedef Kokkos::Impl::FunctorValueOps<    FunctorType , ArgTag > ValueOps ;
+
+      const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
+
+      int n , j ;
+
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
+      }
+
+      if ( rev_rank ) {
+        m_worker_state = QthreadExec::Inactive ;
+        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+      }
+      else {
+        // Root thread scans across values before releasing threads
+        // Worker data is in reverse order, so m_worker_base[0] is the 
+        // highest ranking thread.
+
+        // Copy from lower ranking to higher ranking worker.
+        for ( int i = 1 ; i < m_worker_size ; ++i ) {
+          ValueOps::copy( func
+                        , m_worker_base[i-1]->m_scratch_alloc
+                        , m_worker_base[i]->m_scratch_alloc
+                        );
+        }
+
+        ValueInit::init( func , m_worker_base[m_worker_size-1]->m_scratch_alloc );
+
+        // Join from lower ranking to higher ranking worker.
+        // Value at m_worker_base[n-1] is zero so skip adding it to m_worker_base[n-2].
+        for ( int i = m_worker_size - 1 ; --i ; ) {
+          ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc );
+        }
+      }
+    
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
+        m_worker_base[j]->m_worker_state = QthreadExec::Active ;
+      }
+    }
+
+  //----------------------------------------
+
+  template< class Type>
+  inline
+  volatile Type * shepherd_team_scratch_value() const
+    { return (volatile Type*)(((unsigned char *) m_scratch_alloc) + m_reduce_end); }
+
+  template< class Type >
+  inline
+  void shepherd_broadcast( Type & value , const int team_size , const int team_rank ) const
+    {
+      if ( m_shepherd_base ) {
+        Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
+        if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value ; }
+        memory_fence();
+        shepherd_barrier( team_size );
+        value = *shared_value ;
+      }
+    }
+
+  template< class Type >
+  inline
+  Type shepherd_reduce( const int team_size , const Type & value ) const
+    {
+      *shepherd_team_scratch_value<Type>() = value ;
+
+      memory_fence();
+
+      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+      int n , j ;
+
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
+      }
+
+      if ( rev_rank ) {
+        m_worker_state = QthreadExec::Inactive ;
+        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+      }
+      else {
+        Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
+        for ( int i = 1 ; i < n ; ++i ) {
+          accum += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
+        }
+        for ( int i = 1 ; i < n ; ++i ) {
+          * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
+        }
+
+        memory_fence();
+      }
+
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
+      }
+
+      return *shepherd_team_scratch_value<Type>();
+    }
+
+  template< class JoinOp >
+  inline
+  typename JoinOp::value_type
+    shepherd_reduce( const int team_size
+                   , const typename JoinOp::value_type & value
+                   , const JoinOp & op ) const
+    {
+      typedef typename JoinOp::value_type Type ;
+
+      *shepherd_team_scratch_value<Type>() = value ;
+
+      memory_fence();
+
+      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+      int n , j ;
+
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
+      }
+
+      if ( rev_rank ) {
+        m_worker_state = QthreadExec::Inactive ;
+        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+      }
+      else {
+        volatile Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
+        for ( int i = 1 ; i < team_size ; ++i ) {
+          op.join( accum , * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
+        }
+        for ( int i = 1 ; i < team_size ; ++i ) {
+          * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
+        }
+
+        memory_fence();
+      }
+
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
+      }
+
+      return *shepherd_team_scratch_value<Type>();
+    }
+
+  template< class Type >
+  inline
+  Type shepherd_scan( const int team_size
+                    , const Type & value
+                    ,       Type * const global_value = 0 ) const
+    {
+      *shepherd_team_scratch_value<Type>() = value ;
+
+      memory_fence();
+
+      const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
+
+      int n , j ;
+
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+        Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
+      }
+
+      if ( rev_rank ) {
+        m_worker_state = QthreadExec::Inactive ;
+        Impl::spinwait( m_worker_state , QthreadExec::Inactive );
+      }
+      else {
+        // Root thread scans across values before releasing threads
+        // Worker data is in reverse order, so m_shepherd_base[0] is the 
+        // highest ranking thread.
+
+        // Copy from lower ranking to higher ranking worker.
+
+        Type accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
+        for ( int i = 1 ; i < team_size ; ++i ) {
+          const Type tmp = * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
+          accum += tmp ;
+          * m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp ;
+        }
+
+        * m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
+          global_value ? atomic_fetch_add( global_value , accum ) : 0 ;
+
+        // Join from lower ranking to higher ranking worker.
+        for ( int i = team_size ; --i ; ) {
+          * m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
+        }
+
+        memory_fence();
+      }
+    
+      for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
+        m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
+      }
+
+      return *shepherd_team_scratch_value<Type>();
+    }
+
+  //----------------------------------------
+
+  static inline
+  int align_alloc( int size )
+    {
+      enum { ALLOC_GRAIN = 1 << 6 /* power of two, 64bytes */};
+      enum { ALLOC_GRAIN_MASK = ALLOC_GRAIN - 1 };
+      return ( size + ALLOC_GRAIN_MASK ) & ~ALLOC_GRAIN_MASK ;
+    }
+
+  void shared_reset( Qthread::scratch_memory_space & );
+
+  void * exec_all_reduce_value() const { return m_scratch_alloc ; }
+
+  static void * exec_all_reduce_result();
+
+  static void resize_worker_scratch( const int reduce_size , const int shared_size );
+  static void clear_workers();
+
+  //----------------------------------------
+
+  inline int worker_rank() const { return m_worker_rank ; }
+  inline int worker_size() const { return m_worker_size ; }
+  inline int shepherd_worker_rank() const { return m_shepherd_worker_rank ; }
+  inline int shepherd_worker_size() const { return m_shepherd_worker_size ; }
+  inline int shepherd_rank() const { return m_shepherd_rank ; }
+  inline int shepherd_size() const { return m_shepherd_size ; }
+
+  static int worker_per_shepherd();
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+class QthreadTeamPolicyMember {
+private:
+
+  typedef Kokkos::Qthread                        execution_space ;
+  typedef execution_space::scratch_memory_space  scratch_memory_space ;
+
+
+        Impl::QthreadExec   & m_exec ;
+  scratch_memory_space        m_team_shared ;
+  const int                   m_team_size ;
+  const int                   m_team_rank ;
+  const int                   m_league_size ;
+  const int                   m_league_end ;
+        int                   m_league_rank ;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  const scratch_memory_space & team_shmem() const { return m_team_shared ; }
+
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
+
+  KOKKOS_INLINE_FUNCTION void team_barrier() const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    {}
+#else
+    { m_exec.shepherd_barrier( m_team_size ); }
+#endif
+
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_broadcast( const Type & value , int rank ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return Type(); }
+#else
+    { return m_exec.template shepherd_broadcast<Type>( value , m_team_size , rank ); }
+#endif
+
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return Type(); }
+#else
+    { return m_exec.template shepherd_reduce<Type>( m_team_size , value ); }
+#endif
+
+  template< typename JoinOp >
+  KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
+    team_reduce( const typename JoinOp::value_type & value
+               , const JoinOp & op ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return typename JoinOp::value_type(); }
+#else
+    { return m_exec.template shepherd_reduce<JoinOp>( m_team_size , value , op ); }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return Type(); }
+#else
+    { return m_exec.template shepherd_scan<Type>( m_team_size , value ); }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   */
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return Type(); }
+#else
+    { return m_exec.template shepherd_scan<Type>( m_team_size , value , global_accum ); }
+#endif
+
+  //----------------------------------------
+  // Private driver for task-team parallel
+
+  QthreadTeamPolicyMember();
+
+  //----------------------------------------
+  // Private for the driver ( for ( member_type i(exec,team); i ; i.next_team() ) { ... }
+
+  // Initialize
+  template< class Arg0 , class Arg1 >
+  QthreadTeamPolicyMember( Impl::QthreadExec & exec , const TeamPolicy<Arg0,Arg1,Qthread> & team )
+    : m_exec( exec )
+    , m_team_shared(0,0)
+    , m_team_size(   team.m_team_size )
+    , m_team_rank(   exec.shepherd_worker_rank() )
+    , m_league_size( team.m_league_size )
+    , m_league_end(  team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) )
+    , m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 )
+  {
+    m_exec.shared_reset( m_team_shared );
+  }
+
+  // Continue
+  operator bool () const { return m_league_rank < m_league_end ; }
+
+  // iterate
+  void next_team() { ++m_league_rank ; m_exec.shared_reset( m_team_shared ); }
+};
+
+} // namespace Impl
+
+template< class Arg0 , class Arg1 >
+class TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread >
+{
+private:
+
+  const int m_league_size ;
+  const int m_team_size ;
+  const int m_shepherd_iter ;
+
+public:
+
+  //! Tag this class as a kokkos execution policy
+  typedef TeamPolicy  execution_policy ;
+  typedef Qthread     execution_space ;
+
+  typedef typename
+    Impl::if_c< ! Impl::is_same< Kokkos::Qthread , Arg0 >::value , Arg0 , Arg1 >::type
+      work_tag ;
+
+  //----------------------------------------
+
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & )
+    { return Qthread::instance().shepherd_worker_size(); }
+
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & f )
+    { return team_size_max( f ); }
+
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType & f , const int& )
+    { return team_size_max( f ); }
+
+  //----------------------------------------
+
+  inline int team_size()   const { return m_team_size ; }
+  inline int league_size() const { return m_league_size ; }
+
+  // One active team per shepherd
+  TeamPolicy( Kokkos::Qthread & q
+            , const int league_size
+            , const int team_size
+            )
+    : m_league_size( league_size )
+    , m_team_size( team_size < q.shepherd_worker_size()
+                 ? team_size : q.shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
+    {
+    }
+
+  // One active team per shepherd
+  TeamPolicy( const int league_size
+            , const int team_size
+            )
+    : m_league_size( league_size )
+    , m_team_size( team_size < Qthread::instance().shepherd_worker_size()
+                 ? team_size : Qthread::instance().shepherd_worker_size() )
+    , m_shepherd_iter( ( league_size + Qthread::instance().shepherd_size() - 1 ) / Qthread::instance().shepherd_size() )
+    {
+    }
+
+  typedef Impl::QthreadTeamPolicyMember member_type ;
+
+  friend class Impl::QthreadTeamPolicyMember ;
+};
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_QTHREADEXEC_HPP */
+
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..dc76a0c42633ad576997f0747b2b934d408d3b70
--- /dev/null
+++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_Parallel.hpp
@@ -0,0 +1,643 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_QTHREAD_PARALLEL_HPP
+#define KOKKOS_QTHREAD_PARALLEL_HPP
+
+#include <vector>
+
+#include <Kokkos_Parallel.hpp>
+
+#include <impl/Kokkos_StaticAssert.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+#include <Qthread/Kokkos_QthreadExec.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
+class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread >  Policy ;
+
+  const FunctorType  m_func ;
+  const Policy       m_policy ;
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if<
+                 ( Impl::is_same< typename PType::work_tag , void >::value )
+                 , const FunctorType & >::type functor
+             , const PType & range )
+    {
+      const typename PType::member_type e = range.end();
+      for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
+        functor( i );
+      }
+    }
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if<
+                 ( ! Impl::is_same< typename PType::work_tag , void >::value )
+                 , const FunctorType & >::type functor
+             , const PType & range )
+    {
+      const typename PType::member_type e = range.end();
+      for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
+        functor( typename PType::work_tag() , i );
+      }
+    }
+
+  // Function is called once by every concurrent thread.
+  static void execute( QthreadExec & exec , const void * arg )
+  {
+    const ParallelFor & self = * ((const ParallelFor *) arg );
+
+    driver( self.m_func , typename Policy::WorkRange( self.m_policy , exec.worker_rank() , exec.worker_size() ) );
+
+    // All threads wait for completion.
+    exec.exec_all_barrier();
+  }
+
+public:
+
+  ParallelFor( const FunctorType & functor
+             , const Policy      & policy
+             )
+    : m_func( functor )
+    , m_policy( policy )
+    {
+      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::execute , this );
+    }
+};
+
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
+class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread >  Policy ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , typename Policy::work_tag > ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType  m_func ;
+  const Policy       m_policy ;
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if<
+                 ( Impl::is_same< typename PType::work_tag , void >::value )
+                 , const FunctorType & >::type functor
+             , reference_type update
+             , const PType & range )
+    {
+      const typename PType::member_type e = range.end();
+      for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
+        functor( i , update );
+      }
+    }
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if<
+                 ( ! Impl::is_same< typename PType::work_tag , void >::value )
+                 , const FunctorType & >::type functor
+             , reference_type update
+             , const PType & range )
+    {
+      const typename PType::member_type e = range.end();
+      for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
+        functor( typename PType::work_tag() , i , update );
+      }
+    }
+
+  static void execute( QthreadExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+
+    driver( self.m_func
+          , ValueInit::init( self.m_func , exec.exec_all_reduce_value() )
+          , typename Policy::WorkRange( self.m_policy , exec.worker_rank() , exec.worker_size() )
+          );
+
+    exec.template exec_all_reduce<FunctorType, typename Policy::work_tag >( self.m_func );
+  }
+
+public:
+
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & functor
+                , const Policy       & policy
+                , const HostViewType & result_view )
+    : m_func( functor )
+    , m_policy( policy )
+    {
+      QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_func ) , 0 );
+
+      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::execute , this );
+
+      const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
+
+      Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_func , data );
+
+      if ( result_view.ptr_on_device() ) {
+        const unsigned n = ValueTraits::value_count( m_func );
+        for ( unsigned i = 0 ; i < n ; ++i ) { result_view.ptr_on_device()[i] = data[i]; }
+      }
+    }
+};
+
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class Arg0 , class Arg1 >
+class ParallelFor< FunctorType , TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > >
+{
+private:
+
+  typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread >  Policy ;
+
+  const FunctorType  m_func ;
+  const Policy       m_team ;
+
+  template< class TagType >
+  KOKKOS_FORCEINLINE_FUNCTION
+  void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
+                 const typename Policy::member_type & >::type member ) const
+    { m_func( member ); }
+
+  template< class TagType >
+  KOKKOS_FORCEINLINE_FUNCTION
+  void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
+                 const typename Policy::member_type & >::type member ) const
+    { m_func( TagType() , member ); }
+
+  static void execute( QthreadExec & exec , const void * arg )
+  {
+    const ParallelFor & self = * ((const ParallelFor *) arg );
+
+    typename Policy::member_type member( exec , self.m_team );
+
+    while ( member ) {
+      self.ParallelFor::template driver< typename Policy::work_tag >( member );
+      member.team_barrier();
+      member.next_team();
+    }
+
+    exec.exec_all_barrier();
+  }
+
+public:
+
+  ParallelFor( const FunctorType & functor ,
+               const Policy      & policy )
+    : m_func( functor )
+    , m_team( policy )
+    {
+      QthreadExec::resize_worker_scratch
+        ( /* reduction   memory */ 0
+        , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) );
+
+      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::execute , this );
+    }
+};
+
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class Arg0 , class Arg1 >
+class ParallelReduce< FunctorType , TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread > >
+{
+private:
+
+  typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Qthread >  Policy ;
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , typename Policy::work_tag > ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType  m_func ;
+  const Policy       m_team ;
+
+  template< class TagType >
+  KOKKOS_FORCEINLINE_FUNCTION
+  void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
+                 const typename Policy::member_type & >::type member
+             , reference_type update ) const
+    { m_func( member , update ); }
+
+  template< class TagType >
+  KOKKOS_FORCEINLINE_FUNCTION
+  void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
+                 const typename Policy::member_type & >::type member
+             , reference_type update ) const
+    { m_func( TagType() , member , update ); }
+
+  static void execute( QthreadExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+
+    // Initialize thread-local value
+    reference_type update = ValueInit::init( self.m_func , exec.exec_all_reduce_value() );
+
+    typename Policy::member_type member( exec , self.m_team );
+
+    while ( member ) {
+      self.ParallelReduce::template driver< typename Policy::work_tag >( member , update );
+      member.team_barrier();
+      member.next_team();
+    }
+
+    exec.template exec_all_reduce< FunctorType , typename Policy::work_tag >( self.m_func );
+  }
+
+public:
+
+  template< class ViewType >
+  ParallelReduce( const FunctorType & functor ,
+                  const Policy      & policy ,
+                  const ViewType    & result )
+    : m_func( functor )
+    , m_team( policy )
+    {
+      QthreadExec::resize_worker_scratch
+        ( /* reduction   memory */ ValueTraits::value_size( functor )
+        , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) );
+
+      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::execute , this );
+
+      const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
+
+      Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_func , data );
+
+      const unsigned n = ValueTraits::value_count( m_func );
+      for ( unsigned i = 0 ; i < n ; ++i ) { result.ptr_on_device()[i] = data[i]; }
+    }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
+class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread > >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Qthread >  Policy ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename Policy::work_tag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , typename Policy::work_tag > ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType  m_func ;
+  const Policy       m_policy ;
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if<
+                 ( Impl::is_same< typename PType::work_tag , void >::value )
+                 , const FunctorType & >::type functor
+             , reference_type update
+             , const bool    final
+             , const PType & range )
+    {
+      const typename PType::member_type e = range.end();
+      for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
+        functor( i , update , final );
+      }
+    }
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if<
+                 ( ! Impl::is_same< typename PType::work_tag , void >::value )
+                 , const FunctorType & >::type functor
+             , reference_type update
+             , const bool    final
+             , const PType & range )
+    {
+      const typename PType::member_type e = range.end();
+      for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
+        functor( typename PType::work_tag() , i , update , final );
+      }
+    }
+
+  static void execute( QthreadExec & exec , const void * arg )
+  {
+    const ParallelScan & self = * ((const ParallelScan *) arg );
+
+    const typename Policy::WorkRange range( self.m_policy , exec.worker_rank() , exec.worker_size() );
+
+    // Initialize thread-local value
+    reference_type update = ValueInit::init( self.m_func , exec.exec_all_reduce_value() );
+
+    driver( self.m_func , update , false , range );
+
+    exec.template exec_all_scan< FunctorType , typename Policy::work_tag >( self.m_func );
+
+    driver( self.m_func , update , true , range );
+
+    exec.exec_all_barrier();
+  }
+
+public:
+
+  ParallelScan( const FunctorType & functor
+              , const Policy      & policy
+              )
+    : m_func( functor )
+    , m_policy( policy )
+    {
+      QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_func ) , 0 );
+
+      Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelScan::execute , this );
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>
+TeamThreadRange(const Impl::QthreadTeamPolicyMember& thread, const iType& count)
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>(thread,count);
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>
+TeamThreadRange( const Impl::QthreadTeamPolicyMember& thread
+               , const iType & begin
+               , const iType & end
+               )
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>(thread,begin,end);
+}
+
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >
+  ThreadVectorRange(const Impl::QthreadTeamPolicyMember& thread, const iType& count) {
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >(thread,count);
+}
+
+
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember> PerTeam(const Impl::QthreadTeamPolicyMember& thread) {
+  return Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>(thread);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember> PerThread(const Impl::QthreadTeamPolicyMember& thread) {
+  return Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>(thread);
+}
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+   *
+   * The range i=0..N-1 is mapped to all threads of the the calling thread team.
+   * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries, const Lambda& lambda) {
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+}
+
+/** \brief  Inter-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the the calling thread team and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
+                     const Lambda & lambda, ValueType& result) {
+
+  result = ValueType();
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    result+=tmp;
+  }
+
+  result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
+}
+
+#if defined( KOKKOS_HAVE_CXX11 )
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
+                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType result = init_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    join(result,tmp);
+  }
+
+  init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
+}
+
+#endif /* #if defined( KOKKOS_HAVE_CXX11 ) */
+
+} // namespace Kokkos
+
+namespace Kokkos {
+/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
+ * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
+    loop_boundaries, const Lambda& lambda) {
+  #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+  #pragma ivdep
+  #endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
+      loop_boundaries, const Lambda & lambda, ValueType& result) {
+  result = ValueType();
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    result+=tmp;
+  }
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
+      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType result = init_result;
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    join(result,tmp);
+  }
+  init_result = result;
+}
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
+ *          for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
+ * Depending on the target execution space the operator might be called twice: once with final=false
+ * and once with final=true. When final==true val contains the prefix sum value. The contribution of this
+ * "i" needs to be added to val no matter whether final==true or not. In a serial execution
+ * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
+ * to the final sum value over all vector lanes.
+ * This functionality requires C++11 support.*/
+template< typename iType, class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
+      loop_boundaries, const FunctorType & lambda) {
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
+  typedef typename ValueTraits::value_type value_type ;
+
+  value_type scan_val = value_type();
+
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,scan_val,true);
+  }
+}
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) {
+  lambda();
+}
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) {
+  if(single_struct.team_member.team_rank()==0) lambda();
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  lambda(val);
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  if(single_struct.team_member.team_rank()==0) {
+    lambda(val);
+  }
+  single_struct.team_member.team_broadcast(val,0);
+}
+
+} // namespace Kokkos
+
+
+#endif /* #define KOKKOS_QTHREAD_PARALLEL_HPP */
+
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..9787d2646296568caca3dccef39d06ee1bbaef55
--- /dev/null
+++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
@@ -0,0 +1,451 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_HAVE_QTHREAD )
+
+#include <stdio.h>
+
+#include <stdlib.h>
+#include <stdexcept>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include <Kokkos_Atomic.hpp>
+#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+typedef TaskMember< Kokkos::Qthread , void , void > Task ;
+
+namespace {
+
+inline
+unsigned padded_sizeof_derived( unsigned sizeof_derived )
+{
+  return sizeof_derived +
+    ( sizeof_derived % sizeof(Task*) ? sizeof(Task*) - sizeof_derived % sizeof(Task*) : 0 );
+}
+
+// int lock_alloc_dealloc = 0 ;
+
+} // namespace
+
+void Task::deallocate( void * ptr )
+{
+  // Counting on 'free' thread safety so lock/unlock not required.
+  // However, isolate calls here to mitigate future need to introduce lock/unlock.
+
+  // lock
+
+  // while ( ! Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 0 , 1 ) );
+
+  free( ptr );
+
+  // unlock
+
+  // Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 1 , 0 );
+}
+
+void * Task::allocate( const unsigned arg_sizeof_derived
+                     , const unsigned arg_dependence_capacity )
+{
+  // Counting on 'malloc' thread safety so lock/unlock not required.
+  // However, isolate calls here to mitigate future need to introduce lock/unlock.
+
+  // lock
+
+  // while ( ! Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 0 , 1 ) );
+
+  void * const ptr = malloc( padded_sizeof_derived( arg_sizeof_derived ) + arg_dependence_capacity * sizeof(Task*) );
+
+  // unlock
+
+  // Kokkos::atomic_compare_exchange_strong( & lock_alloc_dealloc , 1 , 0 );
+
+  return ptr ;
+}
+
+Task::~TaskMember()
+{
+
+}
+
+
+Task::TaskMember( const function_verify_type        arg_verify
+                , const function_dealloc_type       arg_dealloc
+                , const function_apply_single_type  arg_apply_single
+                , const function_apply_team_type    arg_apply_team
+                , volatile int &                    arg_active_count
+                , const unsigned                    arg_sizeof_derived
+                , const unsigned                    arg_dependence_capacity
+                )
+  : m_dealloc( arg_dealloc )
+  , m_verify(  arg_verify )
+  , m_apply_single( arg_apply_single )
+  , m_apply_team( arg_apply_team )
+  , m_active_count( & arg_active_count )
+  , m_qfeb(0)
+  , m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) )
+  , m_dep_capacity( arg_dependence_capacity )
+  , m_dep_size( 0 )
+  , m_ref_count( 0 )
+  , m_state( Kokkos::Experimental::TASK_STATE_CONSTRUCTING )
+{
+  qthread_empty( & m_qfeb ); // Set to full when complete
+  for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
+}
+
+Task::TaskMember( const function_dealloc_type       arg_dealloc
+                , const function_apply_single_type  arg_apply_single
+                , const function_apply_team_type    arg_apply_team
+                , volatile int &                    arg_active_count
+                , const unsigned                    arg_sizeof_derived
+                , const unsigned                    arg_dependence_capacity
+                )
+  : m_dealloc( arg_dealloc )
+  , m_verify(  & Task::verify_type<void> )
+  , m_apply_single( arg_apply_single )
+  , m_apply_team( arg_apply_team )
+  , m_active_count( & arg_active_count )
+  , m_qfeb(0)
+  , m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) )
+  , m_dep_capacity( arg_dependence_capacity )
+  , m_dep_size( 0 )
+  , m_ref_count( 0 )
+  , m_state( Kokkos::Experimental::TASK_STATE_CONSTRUCTING )
+{
+  qthread_empty( & m_qfeb ); // Set to full when complete
+  for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
+}
+
+//----------------------------------------------------------------------------
+
+void Task::throw_error_add_dependence() const
+{
+  std::cerr << "TaskMember< Qthread >::add_dependence ERROR"
+            << " state(" << m_state << ")"
+            << " dep_size(" << m_dep_size << ")"
+            << std::endl ;
+  throw std::runtime_error("TaskMember< Qthread >::add_dependence ERROR");
+}
+
+void Task::throw_error_verify_type()
+{
+  throw std::runtime_error("TaskMember< Qthread >::verify_type ERROR");
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw )
+{
+  static const char msg_error_header[]      = "Kokkos::Impl::TaskManager<Kokkos::Qthread>::assign ERROR" ;
+  static const char msg_error_count[]       = ": negative reference count" ;
+  static const char msg_error_complete[]    = ": destroy task that is not complete" ;
+  static const char msg_error_dependences[] = ": destroy task that has dependences" ;
+  static const char msg_error_exception[]   = ": caught internal exception" ;
+
+  if ( rhs ) { Kokkos::atomic_fetch_add( & (*rhs).m_ref_count , 1 ); }
+
+  Task * const lhs_val = Kokkos::atomic_exchange( lhs , rhs );
+
+  if ( lhs_val ) {
+
+    const int count = Kokkos::atomic_fetch_add( & (*lhs_val).m_ref_count , -1 );
+
+    const char * msg_error = 0 ;
+
+    try {
+
+      if ( 1 == count ) {
+
+        // Reference count at zero, delete it
+
+        // Should only be deallocating a completed task
+        if ( (*lhs_val).m_state == Kokkos::Experimental::TASK_STATE_COMPLETE ) {
+
+          // A completed task should not have dependences...
+          for ( int i = 0 ; i < (*lhs_val).m_dep_size && 0 == msg_error ; ++i ) {
+            if ( (*lhs_val).m_dep[i] ) msg_error = msg_error_dependences ;
+          }
+        }
+        else {
+          msg_error = msg_error_complete ;
+        }
+
+        if ( 0 == msg_error ) {
+          // Get deletion function and apply it
+          const Task::function_dealloc_type d = (*lhs_val).m_dealloc ;
+
+          (*d)( lhs_val );
+        }
+      }
+      else if ( count <= 0 ) {
+        msg_error = msg_error_count ;
+      }
+    }
+    catch( ... ) {
+      if ( 0 == msg_error ) msg_error = msg_error_exception ;
+    }
+
+    if ( 0 != msg_error ) {
+      if ( no_throw ) {
+        std::cerr << msg_error_header << msg_error << std::endl ;
+        std::cerr.flush();
+      }
+      else {
+        std::string msg(msg_error_header);
+        msg.append(msg_error);
+        throw std::runtime_error( msg );
+      }
+    }
+  }
+}
+#endif
+
+
+//----------------------------------------------------------------------------
+
+aligned_t Task::qthread_func( void * arg )
+{
+  Task * const task = reinterpret_cast< Task * >(arg);
+
+  // First member of the team change state to executing.
+  // Use compare-exchange to avoid race condition with a respawn.
+  Kokkos::atomic_compare_exchange_strong( & task->m_state
+                                        , int(Kokkos::Experimental::TASK_STATE_WAITING)
+                                        , int(Kokkos::Experimental::TASK_STATE_EXECUTING)
+                                        );
+
+  // It is a single thread's responsibility to close out
+  // this task's execution.
+  bool close_out = false ;
+
+  if ( task->m_apply_team ) {
+
+    Kokkos::Impl::QthreadTeamPolicyMember member ;
+
+    (*task->m_apply_team)( task , member );
+
+fprintf( stdout
+       , "worker(%d.%d) task 0x%.12lx executed by member(%d:%d)\n"
+       , qthread_shep()
+       , qthread_worker_local(NULL)
+       , reinterpret_cast<unsigned long>(task)
+       , member.team_rank()
+       , member.team_size()
+       );
+fflush(stdout);
+
+    member.team_barrier();
+
+    close_out = member.team_rank() == 0 ;
+  }
+  else {
+    (*task->m_apply_single)( task );
+
+    close_out = true ;
+  }
+
+  if ( close_out ) {
+
+    // When dependent tasks run there would be a race
+    // condition between destroying this task and
+    // querying the active count pointer from this task.
+    int volatile * active_count = task->m_active_count ;
+
+    if ( task->m_state == ( Kokkos::Experimental::TASK_STATE_WAITING | Kokkos::Experimental::TASK_STATE_EXECUTING ) ) {
+
+#if 0
+fprintf( stdout
+       , "worker(%d.%d) task 0x%.12lx respawn\n"
+       , qthread_shep()
+       , qthread_worker_local(NULL)
+       , reinterpret_cast<unsigned long>(task)
+       );
+fflush(stdout);
+#endif
+
+      // Task respawned, set state to waiting and reschedule the task
+      task->m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
+      task->schedule();
+    }
+    else {
+
+      // Task did not respawn, is complete
+      task->m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ;
+
+      // Release dependences before allowing dependent tasks to run.
+      // Otherwise there is a thread race condition for removing dependences.
+      for ( int i = 0 ; i < task->m_dep_size ; ++i ) {
+        assign( & task->m_dep[i] , 0 );
+      }
+
+      // Set qthread FEB to full so that dependent tasks are allowed to execute.
+      // This 'task' may be deleted immediately following this function call.
+      qthread_fill( & task->m_qfeb );
+    }
+
+    // Decrement active task count before returning.
+    Kokkos::atomic_decrement( active_count );
+  }
+
+#if 0
+fprintf( stdout
+       , "worker(%d.%d) task 0x%.12lx return\n"
+       , qthread_shep()
+       , qthread_worker_local(NULL)
+       , reinterpret_cast<unsigned long>(task)
+       );
+fflush(stdout);
+#endif
+
+  return 0 ;
+}
+
+void Task::respawn()
+{
+  // Change state from pure executing to ( waiting | executing )
+  // to avoid confusion with simply waiting.
+  Kokkos::atomic_compare_exchange_strong( & m_state
+                                        , int(Kokkos::Experimental::TASK_STATE_EXECUTING)
+                                        , int(Kokkos::Experimental::TASK_STATE_WAITING |
+                                              Kokkos::Experimental::TASK_STATE_EXECUTING)
+                                        );
+}
+
+void Task::schedule()
+{
+  // Is waiting for execution
+
+  // Increment active task count before spawning.
+  Kokkos::atomic_increment( m_active_count );
+
+  // spawn in qthread.  must malloc the precondition array and give to qthread.
+  // qthread will eventually free this allocation so memory will not be leaked.
+
+  // concern with thread safety of malloc, does this need to be guarded?
+  aligned_t ** qprecon = (aligned_t **) malloc( ( m_dep_size + 1 ) * sizeof(aligned_t *) );
+
+  qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) );
+
+  for ( int i = 0 ; i < m_dep_size ; ++i ) {
+    qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthread precondition flag
+  }
+
+  if ( m_apply_single ) {
+    qthread_spawn( & Task::qthread_func /* function */
+                 , this                 /* function argument */
+                 , 0
+                 , NULL
+                 , m_dep_size , qprecon /* dependences */
+                 , NO_SHEPHERD
+                 , QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
+                 );
+  }
+  else {
+    // If more than one shepherd spawn on a shepherd other than this shepherd
+    const int num_shepherd            = qthread_num_shepherds();
+    const int num_worker_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
+    const int this_shepherd           = qthread_shep();
+
+    int spawn_shepherd = ( this_shepherd + 1 ) % num_shepherd ;
+
+fprintf( stdout
+       , "worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n"
+       , qthread_shep()
+       , qthread_worker_local(NULL)
+       , reinterpret_cast<unsigned long>(this)
+       , spawn_shepherd
+       , num_worker_per_shepherd - 1
+       );
+fflush(stdout);
+
+    qthread_spawn_cloneable
+      ( & Task::qthread_func
+      , this
+      , 0
+      , NULL
+      , m_dep_size , qprecon /* dependences */
+      , spawn_shepherd
+      // , unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY )
+      , unsigned( QTHREAD_SPAWN_LOCAL_PRIORITY )
+      , num_worker_per_shepherd - 1
+      );
+  }
+}
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Experimental {
+
+TaskPolicy< Kokkos::Qthread >::member_type &
+TaskPolicy< Kokkos::Qthread >::member_single()
+{
+  static member_type s ;
+  return s ;
+}
+
+void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
+{
+  volatile int * const active_task_count = & policy.m_active_count ;
+  while ( *active_task_count ) qthread_yield();
+}
+
+} // namespace Experimental
+} // namespace Kokkos
+
+#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */
+
diff --git a/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..af44b62a1977d59ca20b01ad6d819b654219e688
--- /dev/null
+++ b/lib/kokkos/core/src/Qthread/Kokkos_Qthread_TaskPolicy.hpp
@@ -0,0 +1,646 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_QTHREAD_TASKPOLICY_HPP
+#define KOKKOS_QTHREAD_TASKPOLICY_HPP
+
+#include <string>
+#include <typeinfo>
+#include <stdexcept>
+
+//----------------------------------------------------------------------------
+// Defines to enable experimental Qthread functionality
+
+#define QTHREAD_LOCAL_PRIORITY
+#define CLONED_TASKS
+
+#include <qthread.h>
+
+#undef QTHREAD_LOCAL_PRIORITY
+#undef CLONED_TASKS
+
+//----------------------------------------------------------------------------
+
+#include <Kokkos_Qthread.hpp>
+#include <Kokkos_TaskPolicy.hpp>
+#include <Kokkos_View.hpp>
+
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template<>
+class TaskMember< Kokkos::Qthread , void , void >
+{
+public:
+
+  typedef void         (* function_apply_single_type) ( TaskMember * );
+  typedef void         (* function_apply_team_type)   ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
+  typedef void         (* function_dealloc_type)( TaskMember * );
+  typedef TaskMember * (* function_verify_type) ( TaskMember * );
+
+private:
+
+  const function_dealloc_type       m_dealloc ;       ///< Deallocation
+  const function_verify_type        m_verify ;        ///< Result type verification
+  const function_apply_single_type  m_apply_single ;  ///< Apply function
+  const function_apply_team_type    m_apply_team ;    ///< Apply function
+  int volatile * const              m_active_count ;  ///< Count of active tasks on this policy
+  aligned_t                         m_qfeb ;          ///< Qthread full/empty bit
+  TaskMember ** const               m_dep ;           ///< Dependences
+  const int                         m_dep_capacity ;  ///< Capacity of dependences
+  int                               m_dep_size ;      ///< Actual count of dependences
+  int                               m_ref_count ;     ///< Reference count
+  int                               m_state ;         ///< State of the task
+
+  TaskMember() /* = delete */ ;
+  TaskMember( const TaskMember & ) /* = delete */ ;
+  TaskMember & operator = ( const TaskMember & ) /* = delete */ ;
+
+  static aligned_t qthread_func( void * arg );
+
+  static void * allocate( const unsigned arg_sizeof_derived , const unsigned arg_dependence_capacity );
+  static void   deallocate( void * );
+
+  void throw_error_add_dependence() const ;
+  static void throw_error_verify_type();
+
+  template < class DerivedTaskType >
+  static
+  void deallocate( TaskMember * t )
+    {
+      DerivedTaskType * ptr = static_cast< DerivedTaskType * >(t);
+      ptr->~DerivedTaskType();
+      deallocate( (void *) ptr );
+    }
+
+  void schedule();
+
+protected :
+
+  ~TaskMember();
+
+  // Used by TaskMember< Qthread , ResultType , void >
+  TaskMember( const function_verify_type        arg_verify
+            , const function_dealloc_type       arg_dealloc
+            , const function_apply_single_type  arg_apply_single
+            , const function_apply_team_type    arg_apply_team
+            , volatile int &                    arg_active_count
+            , const unsigned                    arg_sizeof_derived
+            , const unsigned                    arg_dependence_capacity
+            );
+
+  // Used for TaskMember< Qthread , void , void >
+  TaskMember( const function_dealloc_type       arg_dealloc
+            , const function_apply_single_type  arg_apply_single
+            , const function_apply_team_type    arg_apply_team
+            , volatile int &                    arg_active_count
+            , const unsigned                    arg_sizeof_derived
+            , const unsigned                    arg_dependence_capacity
+            );
+
+public:
+
+  template< typename ResultType >
+  KOKKOS_FUNCTION static
+  TaskMember * verify_type( TaskMember * t )
+    {
+      enum { check_type = ! Kokkos::Impl::is_same< ResultType , void >::value };
+
+      if ( check_type && t != 0 ) {
+
+        // Verify that t->m_verify is this function
+        const function_verify_type self = & TaskMember::template verify_type< ResultType > ;
+
+        if ( t->m_verify != self ) {
+          t = 0 ;
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+          throw_error_verify_type();
+#endif
+        }
+      }
+      return t ;
+    }
+
+  //----------------------------------------
+  /*  Inheritence Requirements on task types:
+   *    typedef  FunctorType::value_type  value_type ;
+   *    class DerivedTaskType
+   *      : public TaskMember< Qthread , value_type , FunctorType >
+   *      { ... };
+   *    class TaskMember< Qthread , value_type , FunctorType >
+   *      : public TaskMember< Qthread , value_type , void >
+   *      , public Functor
+   *      { ... };
+   *  If value_type != void
+   *    class TaskMember< Qthread , value_type , void >
+   *      : public TaskMember< Qthread , void , void >
+   *
+   *  Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
+   *
+   */
+
+  /** \brief  Allocate and construct a single-thread task */
+  template< class DerivedTaskType >
+  static
+  TaskMember * create_single( const typename DerivedTaskType::functor_type &  arg_functor
+                            , volatile int &                                  arg_active_count
+                            , const unsigned                                  arg_dependence_capacity )
+    {
+      typedef typename DerivedTaskType::functor_type  functor_type ;
+      typedef typename functor_type::value_type       value_type ;
+
+      DerivedTaskType * const task =
+        new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
+          DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
+                         , & TaskMember::template apply_single< functor_type , value_type >
+                         , 0
+                         , arg_active_count
+                         , sizeof(DerivedTaskType)
+                         , arg_dependence_capacity
+                         , arg_functor );
+
+      return static_cast< TaskMember * >( task );
+    }
+
+  /** \brief  Allocate and construct a team-thread task */
+  template< class DerivedTaskType >
+  static
+  TaskMember * create_team( const typename DerivedTaskType::functor_type &  arg_functor
+                          , volatile int &                                  arg_active_count
+                          , const unsigned                                  arg_dependence_capacity )
+    {
+      typedef typename DerivedTaskType::functor_type  functor_type ;
+      typedef typename functor_type::value_type       value_type ;
+
+      DerivedTaskType * const task =
+        new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
+          DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
+                         , 0
+                         , & TaskMember::template apply_team< functor_type , value_type >
+                         , arg_active_count
+                         , sizeof(DerivedTaskType)
+                         , arg_dependence_capacity
+                         , arg_functor );
+
+      return static_cast< TaskMember * >( task );
+    }
+
+  void respawn();
+  void spawn()
+    {
+       m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
+       schedule();
+    }
+
+  //----------------------------------------
+
+  typedef FutureValueTypeIsVoidError get_result_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const { return get_result_type() ; }
+
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::Experimental::TaskState get_state() const { return Kokkos::Experimental::TaskState( m_state ); }
+
+  //----------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  static
+  void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false );
+#else
+  KOKKOS_INLINE_FUNCTION static
+  void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false ) {}
+#endif
+
+  KOKKOS_INLINE_FUNCTION
+  TaskMember * get_dependence( int i ) const
+    { return ( Kokkos::Experimental::TASK_STATE_EXECUTING == m_state && 0 <= i && i < m_dep_size ) ? m_dep[i] : (TaskMember*) 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int get_dependence() const
+    { return m_dep_size ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void clear_dependence()
+    {
+      for ( int i = 0 ; i < m_dep_size ; ++i ) assign( m_dep + i , 0 );
+      m_dep_size = 0 ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( TaskMember * before )
+    {
+      if ( ( Kokkos::Experimental::TASK_STATE_CONSTRUCTING == m_state ||
+             Kokkos::Experimental::TASK_STATE_EXECUTING    == m_state ) &&
+           m_dep_size < m_dep_capacity ) {
+        assign( m_dep + m_dep_size , before );
+        ++m_dep_size ;
+      }
+      else {
+        throw_error_add_dependence();
+      }
+    }
+
+  //----------------------------------------
+
+  template< class FunctorType , class ResultType >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_single( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
+    {
+      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
+
+      // TaskMember< Kokkos::Qthread , ResultType , FunctorType >
+      //   : public TaskMember< Kokkos::Qthread , ResultType , void >
+      //   , public FunctorType
+      //   { ... };
+
+      derived_type & m = * static_cast< derived_type * >( t );
+
+      Kokkos::Impl::FunctorApply< FunctorType , void , ResultType & >::apply( (FunctorType &) m , & m.m_result );
+    }
+
+  template< class FunctorType , class ResultType >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_single( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
+    {
+      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
+
+      // TaskMember< Kokkos::Qthread , ResultType , FunctorType >
+      //   : public TaskMember< Kokkos::Qthread , ResultType , void >
+      //   , public FunctorType
+      //   { ... };
+
+      derived_type & m = * static_cast< derived_type * >( t );
+
+      Kokkos::Impl::FunctorApply< FunctorType , void , void >::apply( (FunctorType &) m );
+    }
+
+  //----------------------------------------
+
+  template< class FunctorType , class ResultType >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_team( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t
+                 , Kokkos::Impl::QthreadTeamPolicyMember & member )
+    {
+      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
+
+      derived_type & m = * static_cast< derived_type * >( t );
+
+      m.FunctorType::apply( member , m.m_result );
+    }
+
+  template< class FunctorType , class ResultType >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_team( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t
+                 , Kokkos::Impl::QthreadTeamPolicyMember & member )
+    {
+      typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
+
+      derived_type & m = * static_cast< derived_type * >( t );
+
+      m.FunctorType::apply( member );
+    }
+};
+
+//----------------------------------------------------------------------------
+/** \brief  Base class for tasks with a result value in the Qthread execution space.
+ *
+ *  The FunctorType must be void because this class is accessed by the
+ *  Future class for the task and result value.
+ *
+ *  Must be derived from TaskMember<S,void,void> 'root class' so the Future class
+ *  can correctly static_cast from the 'root class' to this class.
+ */
+template < class ResultType >
+class TaskMember< Kokkos::Qthread , ResultType , void >
+  : public TaskMember< Kokkos::Qthread , void , void >
+{
+public:
+
+  ResultType  m_result ;
+
+  typedef const ResultType & get_result_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const { return m_result ; }
+
+protected:
+
+  typedef TaskMember< Kokkos::Qthread , void , void >  task_root_type ;
+  typedef task_root_type::function_dealloc_type        function_dealloc_type ;
+  typedef task_root_type::function_apply_single_type   function_apply_single_type ;
+  typedef task_root_type::function_apply_team_type     function_apply_team_type ;
+
+  inline
+  TaskMember( const function_dealloc_type       arg_dealloc
+            , const function_apply_single_type  arg_apply_single
+            , const function_apply_team_type    arg_apply_team
+            , volatile int &                    arg_active_count
+            , const unsigned                    arg_sizeof_derived
+            , const unsigned                    arg_dependence_capacity
+            )
+    : task_root_type( & task_root_type::template verify_type< ResultType >
+                    , arg_dealloc
+                    , arg_apply_single
+                    , arg_apply_team
+                    , arg_active_count
+                    , arg_sizeof_derived
+                    , arg_dependence_capacity )
+    , m_result()
+    {}
+};
+
+template< class ResultType , class FunctorType >
+class TaskMember< Kokkos::Qthread , ResultType , FunctorType >
+  : public TaskMember< Kokkos::Qthread , ResultType , void >
+  , public FunctorType
+{
+public:
+
+  typedef FunctorType  functor_type ;
+
+  typedef TaskMember< Kokkos::Qthread , void , void >        task_root_type ;
+  typedef TaskMember< Kokkos::Qthread , ResultType , void >  task_base_type ;
+  typedef task_root_type::function_dealloc_type              function_dealloc_type ;
+  typedef task_root_type::function_apply_single_type         function_apply_single_type ;
+  typedef task_root_type::function_apply_team_type           function_apply_team_type ;
+
+  inline
+  TaskMember( const function_dealloc_type       arg_dealloc
+            , const function_apply_single_type  arg_apply_single
+            , const function_apply_team_type    arg_apply_team
+            , volatile int &                    arg_active_count
+            , const unsigned                    arg_sizeof_derived
+            , const unsigned                    arg_dependence_capacity
+            , const functor_type &              arg_functor
+            )
+    : task_base_type( arg_dealloc
+                    , arg_apply_single
+                    , arg_apply_team
+                    , arg_active_count
+                    , arg_sizeof_derived
+                    , arg_dependence_capacity )
+    , functor_type( arg_functor )
+    {}
+};
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+void wait( TaskPolicy< Kokkos::Qthread > & );
+
+template<>
+class TaskPolicy< Kokkos::Qthread >
+{
+public:
+
+  typedef Kokkos::Qthread                        execution_space ;
+  typedef Kokkos::Impl::QthreadTeamPolicyMember  member_type ;
+
+private:
+
+  typedef Impl::TaskMember< execution_space , void , void > task_root_type ;
+
+  TaskPolicy & operator = ( const TaskPolicy & ) /* = delete */ ;
+
+  template< class FunctorType >
+  static inline
+  const task_root_type * get_task_root( const FunctorType * f )
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
+      return static_cast< const task_root_type * >( static_cast< const task_type * >(f) );
+    }
+
+  template< class FunctorType >
+  static inline
+  task_root_type * get_task_root( FunctorType * f )
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
+      return static_cast< task_root_type * >( static_cast< task_type * >(f) );
+    }
+
+  const unsigned  m_default_dependence_capacity ;
+  volatile int    m_active_count_root ;
+  volatile int &  m_active_count ;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy()
+    : m_default_dependence_capacity(4)
+    , m_active_count_root(0)
+    , m_active_count( m_active_count_root )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  explicit
+  TaskPolicy( const unsigned arg_default_dependence_capacity )
+    : m_default_dependence_capacity( arg_default_dependence_capacity )
+    , m_active_count_root(0)
+    , m_active_count( m_active_count_root )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy( const TaskPolicy & rhs )
+    : m_default_dependence_capacity( rhs.m_default_dependence_capacity )
+    , m_active_count_root(0)
+    , m_active_count( rhs.m_active_count )
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy( const TaskPolicy & rhs
+            , const unsigned arg_default_dependence_capacity )
+    : m_default_dependence_capacity( arg_default_dependence_capacity )
+    , m_active_count_root(0)
+    , m_active_count( rhs.m_active_count )
+    {}
+
+  //----------------------------------------
+
+  template< class ValueType >
+  const Future< ValueType , execution_space > &
+    spawn( const Future< ValueType , execution_space > & f ) const
+      {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        f.m_task->spawn();
+#endif
+        return f ;
+      }
+
+  // Create single-thread task
+
+  template< class FunctorType >
+  Future< typename FunctorType::value_type , execution_space >
+  create( const FunctorType & functor
+        , const unsigned dependence_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type value_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
+      return Future< value_type , execution_space >(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        task_root_type::create_single< task_type >
+          ( functor
+          , m_active_count
+          , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity )
+          )
+#endif
+        );
+    }
+
+  // Create thread-team task
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  create_team( const FunctorType & functor
+             , const unsigned dependence_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type  value_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
+
+      return Future< value_type , execution_space >(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        task_root_type::create_team< task_type >
+          ( functor
+          , m_active_count
+          , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity )
+          )
+#endif
+        );
+    }
+
+  // Add dependence
+  template< class A1 , class A2 , class A3 , class A4 >
+  void add_dependence( const Future<A1,A2> & after
+                     , const Future<A3,A4> & before
+                     , typename Kokkos::Impl::enable_if
+                        < Kokkos::Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
+                          &&
+                          Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
+                        >::type * = 0
+                      )
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      after.m_task->add_dependence( before.m_task );
+#endif
+    }
+
+  //----------------------------------------
+  // Functions for an executing task functor to query dependences,
+  // set new dependences, and respawn itself.
+
+  template< class FunctorType >
+  Future< void , execution_space >
+  get_dependence( const FunctorType * task_functor , int i ) const
+    {
+      return Future<void,execution_space>(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        get_task_root(task_functor)->get_dependence(i)
+#endif
+        );
+    }
+
+  template< class FunctorType >
+  int get_dependence( const FunctorType * task_functor ) const
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return get_task_root(task_functor)->get_dependence(); }
+#else
+    { return 0 ; }
+#endif
+
+  template< class FunctorType >
+  void clear_dependence( FunctorType * task_functor ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      get_task_root(task_functor)->clear_dependence();
+#endif
+    }
+
+  template< class FunctorType , class A3 , class A4 >
+  void add_dependence( FunctorType * task_functor
+                     , const Future<A3,A4> & before
+                     , typename Kokkos::Impl::enable_if
+                        < Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
+                        >::type * = 0
+                      )
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      get_task_root(task_functor)->add_dependence( before.m_task );
+#endif
+    }
+
+  template< class FunctorType >
+  void respawn( FunctorType * task_functor ) const
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { get_task_root(task_functor)->respawn(); }
+#else
+    {}
+#endif
+
+  static member_type & member_single();
+
+  friend void wait( TaskPolicy< Kokkos::Qthread > & );
+};
+
+} /* namespace Experimental */
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_QTHREAD_TASK_HPP */
+
diff --git a/lib/kokkos/core/src/Qthread/README b/lib/kokkos/core/src/Qthread/README
new file mode 100755
index 0000000000000000000000000000000000000000..5d8f29a4ee706d813fe344c35d4ad1c96bfbb024
--- /dev/null
+++ b/lib/kokkos/core/src/Qthread/README
@@ -0,0 +1,28 @@
+
+# This Qthreads back-end uses an experimental branch of the Qthreads repository with special #define options.
+
+# Cloning repository and branch:
+
+git clone https://github.com/stelleg/qthreads qthreads-with-clone
+
+cd qthreads-with-clone
+
+# Added to ./git/config
+#
+# [branch "cloned_tasks"]
+#        remote = origin
+#        merge = refs/heads/cloned_tasks
+#
+
+git branch cloned_tasks
+git checkout cloned_tasks
+git pull
+
+sh autogen.sh
+
+# configurure with 'hwloc' installation:
+
+./configure CFLAGS="-DCLONED_TASKS -DQTHREAD_LOCAL_PRIORITY" --with-hwloc=${HWLOCDIR} --prefix=${INSTALLDIR}
+
+
+
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..99553fccb1fae82678b5b6e938a41f08859b0921
--- /dev/null
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp
@@ -0,0 +1,758 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_HAVE_PTHREAD ) || defined( KOKKOS_HAVE_WINTHREAD )
+
+#include <stdint.h>
+#include <limits>
+#include <utility>
+#include <iostream>
+#include <sstream>
+#include <Kokkos_Threads.hpp>
+#include <Kokkos_hwloc.hpp>
+#include <Kokkos_Atomic.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+ThreadsExec                  s_threads_process ;
+ThreadsExec                * s_threads_exec[  ThreadsExec::MAX_THREAD_COUNT ] = { 0 };
+pthread_t                    s_threads_pid[   ThreadsExec::MAX_THREAD_COUNT ] = { 0 };
+std::pair<unsigned,unsigned> s_threads_coord[ ThreadsExec::MAX_THREAD_COUNT ];
+
+int s_thread_pool_size[3] = { 0 , 0 , 0 };
+
+unsigned s_current_reduce_size = 0 ;
+unsigned s_current_shared_size = 0 ;
+
+void (* volatile s_current_function)( ThreadsExec & , const void * );
+const void * volatile s_current_function_arg = 0 ;
+
+struct Sentinel {
+  Sentinel()
+  {
+    HostSpace::register_in_parallel( ThreadsExec::in_parallel );
+  }
+
+  ~Sentinel()
+  {
+    if ( s_thread_pool_size[0] ||
+         s_thread_pool_size[1] ||
+         s_thread_pool_size[2] ||
+         s_current_reduce_size ||
+         s_current_shared_size ||
+         s_current_function ||
+         s_current_function_arg ||
+         s_threads_exec[0] ) {
+      std::cerr << "ERROR : Process exiting without calling Kokkos::Threads::terminate()" << std::endl ;
+    }
+  }
+};
+
+inline
+unsigned fan_size( const unsigned rank , const unsigned size )
+{
+  const unsigned rank_rev = size - ( rank + 1 );
+  unsigned count = 0 ;
+  for ( unsigned n = 1 ; ( rank_rev + n < size ) && ! ( rank_rev & n ) ; n <<= 1 ) { ++count ; }
+  return count ;
+}
+
+} // namespace
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+void execute_function_noop( ThreadsExec & , const void * ) {}
+
+void ThreadsExec::driver(void)
+{
+  ThreadsExec this_thread ;
+
+  while ( ThreadsExec::Active == this_thread.m_pool_state ) {
+
+    (*s_current_function)( this_thread , s_current_function_arg );
+
+    // Deactivate thread and wait for reactivation
+    this_thread.m_pool_state = ThreadsExec::Inactive ;
+
+    wait_yield( this_thread.m_pool_state , ThreadsExec::Inactive );
+  }
+}
+
+ThreadsExec::ThreadsExec()
+  : m_pool_base(0)
+  , m_scratch()
+  , m_scratch_reduce_end(0)
+  , m_scratch_thread_end(0)
+  , m_numa_rank(0)
+  , m_numa_core_rank(0)
+  , m_pool_rank(0)
+  , m_pool_size(0)
+  , m_pool_fan_size(0)
+  , m_pool_state( ThreadsExec::Terminating )
+{
+  if ( & s_threads_process != this ) {
+
+    // A spawned thread
+
+    ThreadsExec * const nil = 0 ;
+
+    // Which entry in 's_threads_exec', possibly determined from hwloc binding
+    const int entry = ((size_t)s_current_function_arg) < size_t(s_thread_pool_size[0])
+                    ? ((size_t)s_current_function_arg)
+                    : size_t(Kokkos::hwloc::bind_this_thread( s_thread_pool_size[0] , s_threads_coord ));
+
+    // Given a good entry set this thread in the 's_threads_exec' array
+    if ( entry < s_thread_pool_size[0] &&
+         nil == atomic_compare_exchange( s_threads_exec + entry , nil , this ) ) {
+
+      const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();
+
+      m_numa_rank       = coord.first ;
+      m_numa_core_rank  = coord.second ;
+      m_pool_base       = s_threads_exec ;
+      m_pool_rank       = s_thread_pool_size[0] - ( entry + 1 );
+      m_pool_size       = s_thread_pool_size[0] ;
+      m_pool_fan_size   = fan_size( m_pool_rank , m_pool_size );
+      m_pool_state      = ThreadsExec::Active ;
+
+      s_threads_pid[ m_pool_rank ] = pthread_self();
+
+      // Inform spawning process that the threads_exec entry has been set.
+      s_threads_process.m_pool_state = ThreadsExec::Active ;
+    }
+    else {
+      // Inform spawning process that the threads_exec entry could not be set.
+      s_threads_process.m_pool_state = ThreadsExec::Terminating ;
+    }
+  }
+  else {
+    // Enables 'parallel_for' to execute on unitialized Threads device
+    m_pool_rank  = 0 ;
+    m_pool_size  = 1 ;
+    m_pool_state = ThreadsExec::Inactive ;
+
+    s_threads_pid[ m_pool_rank ] = pthread_self();
+  }
+}
+
+ThreadsExec::~ThreadsExec()
+{
+  const unsigned entry = m_pool_size - ( m_pool_rank + 1 );
+
+  m_pool_base   = 0 ;
+  m_scratch.clear();
+  m_scratch_reduce_end = 0 ;
+  m_scratch_thread_end = 0 ;
+  m_numa_rank      = 0 ;
+  m_numa_core_rank = 0 ;
+  m_pool_rank      = 0 ;
+  m_pool_size      = 0 ;
+  m_pool_fan_size  = 0 ;
+
+  m_pool_state  = ThreadsExec::Terminating ;
+
+  if ( & s_threads_process != this && entry < MAX_THREAD_COUNT ) {
+    ThreadsExec * const nil = 0 ;
+
+    atomic_compare_exchange( s_threads_exec + entry , this , nil );
+
+    s_threads_process.m_pool_state = ThreadsExec::Terminating ;
+  }
+}
+
+
+int ThreadsExec::get_thread_count()
+{
+  return s_thread_pool_size[0] ;
+}
+
+ThreadsExec * ThreadsExec::get_thread( const int init_thread_rank )
+{
+  ThreadsExec * const th =
+    init_thread_rank < s_thread_pool_size[0]
+    ? s_threads_exec[ s_thread_pool_size[0] - ( init_thread_rank + 1 ) ] : 0 ;
+
+  if ( 0 == th || th->m_pool_rank != init_thread_rank ) {
+    std::ostringstream msg ;
+    msg << "Kokkos::Impl::ThreadsExec::get_thread ERROR : "
+        << "thread " << init_thread_rank << " of " << s_thread_pool_size[0] ;
+    if ( 0 == th ) {
+      msg << " does not exist" ;
+    }
+    else {
+      msg << " has wrong thread_rank " << th->m_pool_rank ;
+    }
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  return th ;
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * )
+{
+  ThreadsExec::global_lock();
+  ThreadsExec::global_unlock();
+
+  const int n = exec.m_pool_fan_size ;
+  const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 );
+
+  for ( int i = 0 ; i < n ; ++i ) {
+    Impl::spinwait( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+  }
+
+  exec.m_pool_state = ThreadsExec::Inactive ;
+}
+
+}
+}
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+void ThreadsExec::verify_is_process( const std::string & name , const bool initialized )
+{
+  if ( ! is_process() ) {
+    std::string msg( name );
+    msg.append( " FAILED : Called by a worker thread, can only be called by the master process." );
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+
+  if ( initialized && 0 == s_thread_pool_size[0] ) {
+    std::string msg( name );
+    msg.append( " FAILED : Threads not initialized." );
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+}
+
+int ThreadsExec::in_parallel()
+{
+  // A thread function is in execution and
+  // the function argument is not the special threads process argument and
+  // the master process is a worker or is not the master process.
+  return s_current_function &&
+         ( & s_threads_process != s_current_function_arg ) &&
+         ( s_threads_process.m_pool_base || ! is_process() );
+}
+
+// Wait for root thread to become inactive
+void ThreadsExec::fence()
+{
+  if ( s_thread_pool_size[0] ) {
+    // Wait for the root thread to complete:
+    Impl::spinwait( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
+  }
+
+  s_current_function     = 0 ;
+  s_current_function_arg = 0 ;
+}
+
+/** \brief  Begin execution of the asynchronous functor */
+void ThreadsExec::start( void (*func)( ThreadsExec & , const void * ) , const void * arg )
+{
+  verify_is_process("ThreadsExec::start" , true );
+
+  if ( s_current_function || s_current_function_arg ) {
+    Kokkos::Impl::throw_runtime_exception( std::string( "ThreadsExec::start() FAILED : already executing" ) );
+  }
+
+  s_current_function     = func ;
+  s_current_function_arg = arg ;
+
+  // Activate threads:
+  for ( int i = s_thread_pool_size[0] ; 0 < i-- ; ) {
+    s_threads_exec[i]->m_pool_state = ThreadsExec::Active ;
+  }
+
+  if ( s_threads_process.m_pool_size ) {
+    // Master process is the root thread, run it:
+    (*func)( s_threads_process , arg );
+    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+bool ThreadsExec::sleep()
+{
+  verify_is_process("ThreadsExec::sleep", true );
+
+  if ( & execute_sleep == s_current_function ) return false ;
+
+  fence();
+
+  ThreadsExec::global_lock();
+
+  s_current_function = & execute_sleep ;
+
+  // Activate threads:
+  for ( unsigned i = s_thread_pool_size[0] ; 0 < i ; ) {
+    s_threads_exec[--i]->m_pool_state = ThreadsExec::Active ;
+  }
+
+  return true ;
+}
+
+bool ThreadsExec::wake()
+{
+  verify_is_process("ThreadsExec::wake", true );
+
+  if ( & execute_sleep != s_current_function ) return false ;
+
+  ThreadsExec::global_unlock();
+
+  if ( s_threads_process.m_pool_base ) {
+    execute_sleep( s_threads_process , 0 );
+    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+  }
+
+  fence();
+
+  return true ;
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsExec::execute_serial( void (*func)( ThreadsExec & , const void * ) )
+{
+  s_current_function = func ;
+  s_current_function_arg = & s_threads_process ;
+
+  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;
+
+  for ( unsigned i = s_thread_pool_size[0] ; begin < i ; ) {
+    ThreadsExec & th = * s_threads_exec[ --i ];
+
+    th.m_pool_state = ThreadsExec::Active ;
+
+    wait_yield( th.m_pool_state , ThreadsExec::Active );
+  }
+
+  if ( s_threads_process.m_pool_base ) {
+    s_threads_process.m_pool_state = ThreadsExec::Active ;
+    (*func)( s_threads_process , 0 );
+    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+  }
+
+  s_current_function_arg = 0 ;
+  s_current_function = 0 ;
+}
+
+//----------------------------------------------------------------------------
+
+void * ThreadsExec::root_reduce_scratch()
+{
+  return s_threads_process.reduce_memory();
+}
+
+void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
+{
+  exec.m_scratch.clear();
+
+  exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ;
+  exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ;
+
+  if ( s_threads_process.m_scratch_thread_end ) {
+
+    exec.m_scratch =
+      HostSpace::allocate_and_track( "thread_scratch" , s_threads_process.m_scratch_thread_end );
+
+    unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch.alloc_ptr() );
+    unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned);
+
+    // touch on this thread
+    while ( ptr < end ) *ptr++ = 0 ;
+  }
+}
+
+void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size )
+{
+  enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
+
+  fence();
+
+  const size_t old_reduce_size = s_threads_process.m_scratch_reduce_end ;
+  const size_t old_thread_size = s_threads_process.m_scratch_thread_end - s_threads_process.m_scratch_reduce_end ;
+
+  reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
+  thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
+
+  // Increase size or deallocate completely.
+
+  if ( ( old_reduce_size < reduce_size ) ||
+       ( old_thread_size < thread_size ) ||
+       ( ( reduce_size == 0 && thread_size == 0 ) &&
+         ( old_reduce_size != 0 || old_thread_size != 0 ) ) ) {
+
+    verify_is_process( "ThreadsExec::resize_scratch" , true );
+
+    s_threads_process.m_scratch_reduce_end = reduce_size ;
+    s_threads_process.m_scratch_thread_end = reduce_size + thread_size ;
+
+    execute_serial( & execute_resize_scratch );
+
+    s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ;
+  }
+
+  return s_threads_process.m_scratch.alloc_ptr() ;
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsExec::print_configuration( std::ostream & s , const bool detail )
+{
+  verify_is_process("ThreadsExec::print_configuration",false);
+
+  fence();
+
+  const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
+  const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+  const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+  // Forestall compiler warnings for unused variables.
+  (void) numa_count;
+  (void) cores_per_numa;
+  (void) threads_per_core;
+
+  s << "Kokkos::Threads" ;
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+  s << " KOKKOS_HAVE_PTHREAD" ;
+#endif
+#if defined( KOKKOS_HAVE_HWLOC )
+  s << " hwloc[" << numa_count << "x" << cores_per_numa << "x" << threads_per_core << "]" ;
+#endif
+
+  if ( s_thread_pool_size[0] ) {
+    s << " threads[" << s_thread_pool_size[0] << "]"
+      << " threads_per_numa[" << s_thread_pool_size[1] << "]"
+      << " threads_per_core[" << s_thread_pool_size[2] << "]"
+      ;
+    if ( 0 == s_threads_process.m_pool_base ) { s << " Asynchronous" ; }
+    s << " ReduceScratch[" << s_current_reduce_size << "]"
+      << " SharedScratch[" << s_current_shared_size << "]" ;
+    s << std::endl ;
+
+    if ( detail ) {
+
+      for ( int i = 0 ; i < s_thread_pool_size[0] ; ++i ) {
+
+        ThreadsExec * const th = s_threads_exec[i] ;
+
+        if ( th ) {
+
+          const int rank_rev = th->m_pool_size - ( th->m_pool_rank + 1 );
+
+          s << " Thread[ " << th->m_pool_rank << " : "
+            << th->m_numa_rank << "." << th->m_numa_core_rank << " ]" ;
+
+          s << " Fan{" ;
+          for ( int j = 0 ; j < th->m_pool_fan_size ; ++j ) {
+            ThreadsExec * const thfan = th->m_pool_base[rank_rev+(1<<j)] ;
+            s << " [ " << thfan->m_pool_rank << " : "
+              << thfan->m_numa_rank << "." << thfan->m_numa_core_rank << " ]" ;
+          }
+          s << " }" ;
+
+          if ( th == & s_threads_process ) {
+            s << " is_process" ;
+          }
+        }
+        s << std::endl ;
+      }
+    }
+  }
+  else {
+    s << " not initialized" << std::endl ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+int ThreadsExec::is_initialized()
+{ return 0 != s_threads_exec[0] ; }
+
+void ThreadsExec::initialize( unsigned thread_count ,
+                              unsigned use_numa_count ,
+                              unsigned use_cores_per_numa ,
+                              bool allow_asynchronous_threadpool )
+{
+  static const Sentinel sentinel ;
+
+  const bool is_initialized = 0 != s_thread_pool_size[0] ;
+
+  unsigned thread_spawn_failed = 0 ;
+
+  for ( int i = 0; i < ThreadsExec::MAX_THREAD_COUNT ; i++)
+    s_threads_exec[i] = NULL;
+
+  if ( ! is_initialized ) {
+
+    // If thread_count, use_numa_count, or use_cores_per_numa are zero
+    // then they will be given default values based upon hwloc detection
+    // and allowed asynchronous execution.
+
+    const bool hwloc_avail = hwloc::available();
+
+    if ( thread_count == 0 ) {
+      thread_count = hwloc_avail
+      ? Kokkos::hwloc::get_available_numa_count() *
+        Kokkos::hwloc::get_available_cores_per_numa() *
+        Kokkos::hwloc::get_available_threads_per_core()
+      : 1 ;
+    }
+
+    const unsigned thread_spawn_begin =
+      hwloc::thread_mapping( "Kokkos::Threads::initialize" ,
+                             allow_asynchronous_threadpool ,
+                             thread_count ,
+                             use_numa_count ,
+                             use_cores_per_numa ,
+                             s_threads_coord );
+
+    const std::pair<unsigned,unsigned> proc_coord = s_threads_coord[0] ;
+
+    if ( thread_spawn_begin ) {
+      // Synchronous with s_threads_coord[0] as the process core
+      // Claim entry #0 for binding the process core.
+      s_threads_coord[0] = std::pair<unsigned,unsigned>(~0u,~0u);
+    }
+
+    s_thread_pool_size[0] = thread_count ;
+    s_thread_pool_size[1] = s_thread_pool_size[0] / use_numa_count ;
+    s_thread_pool_size[2] = s_thread_pool_size[1] / use_cores_per_numa ;
+    s_current_function = & execute_function_noop ; // Initialization work function
+
+    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
+
+      s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+
+      // If hwloc available then spawned thread will
+      // choose its own entry in 's_threads_coord'
+      // otherwise specify the entry.
+      s_current_function_arg = (void*)static_cast<uintptr_t>( hwloc_avail ? ~0u : ith );
+
+      // Spawn thread executing the 'driver()' function.
+      // Wait until spawned thread has attempted to initialize.
+      // If spawning and initialization is successfull then
+      // an entry in 's_threads_exec' will be assigned.
+      if ( ThreadsExec::spawn() ) {
+        wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
+      }
+      if ( s_threads_process.m_pool_state == ThreadsExec::Terminating ) break ;
+    }
+
+    // Wait for all spawned threads to deactivate before zeroing the function.
+
+    for ( unsigned ith = thread_spawn_begin ; ith < thread_count ; ++ith ) {
+      // Try to protect against cache coherency failure by casting to volatile.
+      ThreadsExec * const th = ((ThreadsExec * volatile *)s_threads_exec)[ith] ;
+      if ( th ) {
+        wait_yield( th->m_pool_state , ThreadsExec::Active );
+      }
+      else {
+        ++thread_spawn_failed ;
+      }
+    }
+
+    s_current_function     = 0 ;
+    s_current_function_arg = 0 ;
+    s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+
+    if ( ! thread_spawn_failed ) {
+      // Bind process to the core on which it was located before spawning occured
+      Kokkos::hwloc::bind_this_thread( proc_coord );
+
+      if ( thread_spawn_begin ) { // Include process in pool.
+        const std::pair<unsigned,unsigned> coord = Kokkos::hwloc::get_this_thread_coordinate();
+
+        s_threads_exec[0]                   = & s_threads_process ;
+        s_threads_process.m_numa_rank       = coord.first ;
+        s_threads_process.m_numa_core_rank  = coord.second ;
+        s_threads_process.m_pool_base       = s_threads_exec ;
+        s_threads_process.m_pool_rank       = thread_count - 1 ; // Reversed for scan-compatible reductions
+        s_threads_process.m_pool_size       = thread_count ;
+        s_threads_process.m_pool_fan_size   = fan_size( s_threads_process.m_pool_rank , s_threads_process.m_pool_size );
+        s_threads_pid[ s_threads_process.m_pool_rank ] = pthread_self();
+      }
+      else {
+        s_threads_process.m_pool_base = 0 ;
+        s_threads_process.m_pool_rank = 0 ;
+        s_threads_process.m_pool_size = 0 ;
+        s_threads_process.m_pool_fan_size = 0 ;
+      }
+
+      // Initial allocations:
+      ThreadsExec::resize_scratch( 1024 , 1024 );
+    }
+    else {
+      s_thread_pool_size[0] = 0 ;
+      s_thread_pool_size[1] = 0 ;
+      s_thread_pool_size[2] = 0 ;
+    }
+  }
+
+  if ( is_initialized || thread_spawn_failed ) {
+
+    std::ostringstream msg ;
+
+    msg << "Kokkos::Threads::initialize ERROR" ;
+
+    if ( is_initialized ) {
+      msg << " : already initialized" ;
+    }
+    if ( thread_spawn_failed ) {
+      msg << " : failed to spawn " << thread_spawn_failed << " threads" ;
+    }
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  // Init the array for used for arbitrarily sized atomics
+  Impl::init_lock_array_host_space();
+
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsExec::finalize()
+{
+  verify_is_process("ThreadsExec::finalize",false);
+
+  fence();
+
+  resize_scratch(0,0);
+
+  const unsigned begin = s_threads_process.m_pool_base ? 1 : 0 ;
+
+  for ( unsigned i = s_thread_pool_size[0] ; begin < i-- ; ) {
+
+    if ( s_threads_exec[i] ) {
+
+      s_threads_exec[i]->m_pool_state = ThreadsExec::Terminating ;
+
+      wait_yield( s_threads_process.m_pool_state , ThreadsExec::Inactive );
+
+      s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+    }
+
+    s_threads_pid[i] = 0 ;
+  }
+
+  if ( s_threads_process.m_pool_base ) {
+    ( & s_threads_process )->~ThreadsExec();
+    s_threads_exec[0] = 0 ;
+  }
+
+  Kokkos::hwloc::unbind_this_thread();
+
+  s_thread_pool_size[0] = 0 ;
+  s_thread_pool_size[1] = 0 ;
+  s_thread_pool_size[2] = 0 ;
+
+  // Reset master thread to run solo.
+  s_threads_process.m_numa_rank       = 0 ;
+  s_threads_process.m_numa_core_rank  = 0 ;
+  s_threads_process.m_pool_base       = 0 ;
+  s_threads_process.m_pool_rank       = 0 ;
+  s_threads_process.m_pool_size       = 1 ;
+  s_threads_process.m_pool_fan_size   = 0 ;
+  s_threads_process.m_pool_state = ThreadsExec::Inactive ;
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+Threads & Threads::instance(int)
+{
+  static Threads t ;
+  return t ;
+}
+
+int Threads::thread_pool_size( int depth )
+{
+  return Impl::s_thread_pool_size[depth];
+}
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+int Threads::thread_pool_rank()
+{
+  const pthread_t pid = pthread_self();
+  int i = 0;
+  while ( ( i < Impl::s_thread_pool_size[0] ) && ( pid != Impl::s_threads_pid[i] ) ) { ++i ; }
+  return i ;
+}
+#endif
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) || defined( KOKKOS_HAVE_WINTHREAD ) */
+
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..38206979770984ce69bdca68d09ccd8a1c0ab3bd
--- /dev/null
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp
@@ -0,0 +1,465 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADSEXEC_HPP
+#define KOKKOS_THREADSEXEC_HPP
+
+#include <stdio.h>
+
+#include <utility>
+#include <impl/Kokkos_spinwait.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+#include <impl/Kokkos_AllocationTracker.hpp>
+
+#include <Kokkos_Atomic.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+class ThreadsExec {
+public:
+
+  // Fan array has log_2(NT) reduction threads plus 2 scan threads
+  // Currently limited to 16k threads.
+  enum { MAX_FAN_COUNT    = 16 };
+  enum { MAX_THREAD_COUNT = 1 << ( MAX_FAN_COUNT - 2 ) };
+  enum { VECTOR_LENGTH    = 8 };
+
+  /** \brief States of a worker thread */
+  enum { Terminating ///<  Termination in progress
+       , Inactive    ///<  Exists, waiting for work
+       , Active      ///<  Exists, performing work
+       , Rendezvous  ///<  Exists, waiting in a barrier or reduce
+
+       , ScanCompleted
+       , ScanAvailable
+       , ReductionAvailable
+       };
+
+private:
+
+  friend class Kokkos::Threads ;
+
+  // Fan-in operations' root is the highest ranking thread
+  // to place the 'scan' reduction intermediate values on
+  // the threads that need them.
+  // For a simple reduction the thread location is arbitrary.
+
+  ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in
+
+  Impl::AllocationTracker m_scratch ;
+  int           m_scratch_reduce_end ;
+  int           m_scratch_thread_end ;
+  int           m_numa_rank ;
+  int           m_numa_core_rank ;
+  int           m_pool_rank ;
+  int           m_pool_size ;
+  int           m_pool_fan_size ;
+  int volatile  m_pool_state ;  ///< State for global synchronizations
+
+
+  static void global_lock();
+  static void global_unlock();
+  static bool spawn();
+
+  static void execute_resize_scratch( ThreadsExec & , const void * );
+  static void execute_sleep(          ThreadsExec & , const void * );
+
+  ThreadsExec( const ThreadsExec & );
+  ThreadsExec & operator = ( const ThreadsExec & );
+
+  static void execute_serial( void (*)( ThreadsExec & , const void * ) );
+
+public:
+
+  KOKKOS_INLINE_FUNCTION int pool_size() const { return m_pool_size ; }
+  KOKKOS_INLINE_FUNCTION int pool_rank() const { return m_pool_rank ; }
+  KOKKOS_INLINE_FUNCTION int numa_rank() const { return m_numa_rank ; }
+  KOKKOS_INLINE_FUNCTION int numa_core_rank() const { return m_numa_core_rank ; }
+
+  static int get_thread_count();
+  static ThreadsExec * get_thread( const int init_thread_rank );
+
+  inline void * reduce_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()); }
+  KOKKOS_INLINE_FUNCTION  void * scratch_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()) + m_scratch_reduce_end ; }
+
+  KOKKOS_INLINE_FUNCTION  int volatile & state() { return m_pool_state ; }
+  KOKKOS_INLINE_FUNCTION  ThreadsExec * const * pool_base() const { return m_pool_base ; }
+
+  static void driver(void);
+
+  ~ThreadsExec();
+  ThreadsExec();
+
+  static void * resize_scratch( size_t reduce_size , size_t thread_size );
+
+  static void * root_reduce_scratch();
+
+  static bool is_process();
+
+  static void verify_is_process( const std::string & , const bool initialized );
+
+  static int is_initialized();
+
+  static void initialize( unsigned thread_count ,
+                          unsigned use_numa_count ,
+                          unsigned use_cores_per_numa ,
+                          bool allow_asynchronous_threadpool );
+
+  static void finalize();
+
+  /* Given a requested team size, return valid team size */
+  static unsigned team_size_valid( unsigned );
+
+  static void print_configuration( std::ostream & , const bool detail = false );
+
+  //------------------------------------
+
+  static void wait_yield( volatile int & , const int );
+
+  //------------------------------------
+  // All-thread functions:
+
+  inline
+  int all_reduce( const int value )
+    {
+      // Make sure there is enough scratch space:
+      const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
+
+      *((volatile int*) reduce_memory()) = value ;
+
+      memory_fence();
+
+      // Fan-in reduction with highest ranking thread as the root
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        // Wait: Active -> Rendezvous
+        Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+      }
+
+      if ( rev_rank ) {
+        m_pool_state = ThreadsExec::Rendezvous ;
+        // Wait: Rendezvous -> Active
+        Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
+      }
+      else {
+        // Root thread does the reduction and broadcast
+
+        int accum = 0 ;
+
+        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
+          accum += *((volatile int *) get_thread( rank )->reduce_memory());
+        }
+
+        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
+          *((volatile int *) get_thread( rank )->reduce_memory()) = accum ;
+        }
+
+        memory_fence();
+
+        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
+          get_thread( rank )->m_pool_state = ThreadsExec::Active ;
+        }
+      }
+
+      return *((volatile int*) reduce_memory());
+    }
+
+  //------------------------------------
+  // All-thread functions:
+
+  template< class FunctorType , class ArgTag >
+  inline
+  void fan_in_reduce( const FunctorType & f ) const
+    {
+      typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > Join ;
+      typedef Kokkos::Impl::FunctorFinal<     FunctorType , ArgTag > Final ;
+
+      const int rev_rank  = m_pool_size - ( m_pool_rank + 1 );
+
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+
+        ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ;
+
+        Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
+
+        Join::join( f , reduce_memory() , fan.reduce_memory() );
+      }
+
+      if ( ! rev_rank ) {
+        Final::final( f , reduce_memory() );
+      }
+    }
+
+  inline
+  void fan_in() const
+    {
+      const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
+
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        Impl::spinwait( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
+      }
+    }
+
+  template< class FunctorType , class ArgTag >
+  inline
+  void scan_large( const FunctorType & f )
+    {
+      // Sequence of states:
+      //  0) Active             : entry and exit state
+      //  1) ReductionAvailable : reduction value available
+      //  2) ScanAvailable      : inclusive scan value available
+      //  3) Rendezvous         : All threads inclusive scan value are available
+      //  4) ScanCompleted      : exclusive scan value copied
+
+      typedef Kokkos::Impl::FunctorValueTraits< FunctorType , ArgTag > Traits ;
+      typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , ArgTag > Join ;
+      typedef Kokkos::Impl::FunctorValueInit<   FunctorType , ArgTag > Init ;
+
+      typedef typename Traits::value_type scalar_type ;
+
+      const int      rev_rank = m_pool_size - ( m_pool_rank + 1 );
+      const unsigned count    = Traits::value_count( f );
+
+      scalar_type * const work_value = (scalar_type *) reduce_memory();
+
+      //--------------------------------
+      // Fan-in reduction with highest ranking thread as the root
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
+
+        // Wait: Active -> ReductionAvailable (or ScanAvailable)
+        Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
+        Join::join( f , work_value , fan.reduce_memory() );
+      }
+
+      // Copy reduction value to scan value before releasing from this phase.
+      for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i] ; }
+
+      if ( rev_rank ) {
+
+        // Set: Active -> ReductionAvailable
+        m_pool_state = ThreadsExec::ReductionAvailable ;
+
+        // Wait for contributing threads' scan value to be available.
+        if ( ( 1 << m_pool_fan_size ) < ( m_pool_rank + 1 ) ) {
+          ThreadsExec & th = *m_pool_base[ rev_rank + ( 1 << m_pool_fan_size ) ] ;
+
+          // Wait: Active             -> ReductionAvailable
+          // Wait: ReductionAvailable -> ScanAvailable
+          Impl::spinwait( th.m_pool_state , ThreadsExec::Active );
+          Impl::spinwait( th.m_pool_state , ThreadsExec::ReductionAvailable );
+
+          Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count );
+        }
+
+        // This thread has completed inclusive scan
+        // Set: ReductionAvailable -> ScanAvailable
+        m_pool_state = ThreadsExec::ScanAvailable ;
+
+        // Wait for all threads to complete inclusive scan
+        // Wait: ScanAvailable -> Rendezvous
+        Impl::spinwait( m_pool_state , ThreadsExec::ScanAvailable );
+      }
+
+      //--------------------------------
+
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
+        // Wait: ReductionAvailable -> ScanAvailable
+        Impl::spinwait( fan.m_pool_state , ThreadsExec::ReductionAvailable );
+        // Set: ScanAvailable -> Rendezvous
+        fan.m_pool_state = ThreadsExec::Rendezvous ;
+      }
+
+      // All threads have completed the inclusive scan.
+      // All non-root threads are in the Rendezvous state.
+      // Threads are free to overwrite their reduction value.
+      //--------------------------------
+
+      if ( ( rev_rank + 1 ) < m_pool_size ) {
+        // Exclusive scan: copy the previous thread's inclusive scan value
+
+        ThreadsExec & th = *m_pool_base[ rev_rank + 1 ] ; // Not the root thread
+
+        const scalar_type * const src_value = ((scalar_type *)th.reduce_memory()) + count ;
+
+        for ( unsigned j = 0 ; j < count ; ++j ) { work_value[j] = src_value[j]; }
+      }
+      else {
+        (void) Init::init( f , work_value );
+      }
+
+      //--------------------------------
+      // Wait for all threads to copy previous thread's inclusive scan value
+      // Wait for all threads: Rendezvous -> ScanCompleted
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
+      }
+      if ( rev_rank ) {
+        // Set: ScanAvailable -> ScanCompleted
+        m_pool_state = ThreadsExec::ScanCompleted ;
+        // Wait: ScanCompleted -> Active
+        Impl::spinwait( m_pool_state , ThreadsExec::ScanCompleted );
+      }
+      // Set: ScanCompleted -> Active
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        m_pool_base[ rev_rank + (1<<i) ]->m_pool_state = ThreadsExec::Active ;
+      }
+    }
+
+  template< class FunctorType , class ArgTag >
+  inline
+  void scan_small( const FunctorType & f )
+    {
+      typedef Kokkos::Impl::FunctorValueTraits< FunctorType , ArgTag > Traits ;
+      typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , ArgTag > Join ;
+      typedef Kokkos::Impl::FunctorValueInit<   FunctorType , ArgTag > Init ;
+
+      typedef typename Traits::value_type scalar_type ;
+
+      const int      rev_rank = m_pool_size - ( m_pool_rank + 1 );
+      const unsigned count    = Traits::value_count( f );
+
+      scalar_type * const work_value = (scalar_type *) reduce_memory();
+
+      //--------------------------------
+      // Fan-in reduction with highest ranking thread as the root
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        // Wait: Active -> Rendezvous
+        Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
+      }
+
+      for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; }
+
+      if ( rev_rank ) {
+        m_pool_state = ThreadsExec::Rendezvous ;
+        // Wait: Rendezvous -> Active
+        Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
+      }
+      else {
+        // Root thread does the thread-scan before releasing threads
+
+        scalar_type * ptr_prev = 0 ;
+
+        for ( int rank = 0 ; rank < m_pool_size ; ++rank ) {
+          scalar_type * const ptr = (scalar_type *) get_thread( rank )->reduce_memory();
+          if ( rank ) {
+            for ( unsigned i = 0 ; i < count ; ++i ) { ptr[i] = ptr_prev[ i + count ]; }
+            Join::join( f , ptr + count , ptr );
+          }
+          else {
+            (void) Init::init( f , ptr );
+          }
+          ptr_prev = ptr ;
+        }
+      }
+
+      for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
+        m_pool_base[ rev_rank + (1<<i) ]->m_pool_state = ThreadsExec::Active ;
+      }
+    }
+
+  //------------------------------------
+  /** \brief  Wait for previous asynchronous functor to
+   *          complete and release the Threads device.
+   *          Acquire the Threads device and start this functor.
+   */
+  static void start( void (*)( ThreadsExec & , const void * ) , const void * );
+
+  static int  in_parallel();
+  static void fence();
+  static bool sleep();
+  static bool wake();
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+inline int Threads::in_parallel()
+{ return Impl::ThreadsExec::in_parallel(); }
+
+inline int Threads::is_initialized()
+{ return Impl::ThreadsExec::is_initialized(); }
+
+inline void Threads::initialize(
+  unsigned threads_count ,
+  unsigned use_numa_count ,
+  unsigned use_cores_per_numa ,
+  bool allow_asynchronous_threadpool )
+{
+  Impl::ThreadsExec::initialize( threads_count , use_numa_count , use_cores_per_numa , allow_asynchronous_threadpool );
+}
+
+inline void Threads::finalize()
+{
+  Impl::ThreadsExec::finalize();
+}
+
+inline void Threads::print_configuration( std::ostream & s , const bool detail )
+{
+  Impl::ThreadsExec::print_configuration( s , detail );
+}
+
+inline bool Threads::sleep()
+{ return Impl::ThreadsExec::sleep() ; }
+
+inline bool Threads::wake()
+{ return Impl::ThreadsExec::wake() ; }
+
+inline void Threads::fence()
+{ Impl::ThreadsExec::fence() ; }
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_THREADSEXEC_HPP */
+
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..40d5efd0fe21e5db54bee49ac98e9bc1af1b12bd
--- /dev/null
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec_base.cpp
@@ -0,0 +1,254 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core_fwd.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+
+/* Standard 'C' Linux libraries */
+
+#include <pthread.h>
+#include <sched.h>
+#include <errno.h>
+
+/* Standard C++ libaries */
+
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <stdexcept>
+
+#include <Kokkos_Threads.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+pthread_mutex_t host_internal_pthread_mutex = PTHREAD_MUTEX_INITIALIZER ;
+
+// Pthreads compatible driver.
+// Recovery from an exception would require constant intra-thread health
+// verification; which would negatively impact runtime.  As such simply
+// abort the process.
+
+void * internal_pthread_driver( void * )
+{
+  try {
+    ThreadsExec::driver();
+  }
+  catch( const std::exception & x ) {
+    std::cerr << "Exception thrown from worker thread: " << x.what() << std::endl ;
+    std::cerr.flush();
+    std::abort();
+  }
+  catch( ... ) {
+    std::cerr << "Exception thrown from worker thread" << std::endl ;
+    std::cerr.flush();
+    std::abort();
+  }
+  return NULL ;
+}
+
+} // namespace
+
+//----------------------------------------------------------------------------
+// Spawn a thread
+
+bool ThreadsExec::spawn()
+{
+  bool result = false ;
+
+  pthread_attr_t attr ;
+
+  if ( 0 == pthread_attr_init( & attr ) ||
+       0 == pthread_attr_setscope(       & attr, PTHREAD_SCOPE_SYSTEM ) ||
+       0 == pthread_attr_setdetachstate( & attr, PTHREAD_CREATE_DETACHED ) ) {
+
+    pthread_t pt ;
+
+    result = 0 == pthread_create( & pt, & attr, internal_pthread_driver, 0 );
+  }
+
+  pthread_attr_destroy( & attr );
+
+  return result ;
+}
+
+//----------------------------------------------------------------------------
+
+bool ThreadsExec::is_process()
+{
+  static const pthread_t master_pid = pthread_self();
+
+  return pthread_equal( master_pid , pthread_self() );
+}
+
+void ThreadsExec::global_lock()
+{
+  pthread_mutex_lock( & host_internal_pthread_mutex );
+}
+
+void ThreadsExec::global_unlock()
+{
+  pthread_mutex_unlock( & host_internal_pthread_mutex );
+}
+
+//----------------------------------------------------------------------------
+
+void ThreadsExec::wait_yield( volatile int & flag , const int value )
+{
+  while ( value == flag ) { sched_yield(); }
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+/* end #if defined( KOKKOS_HAVE_PTHREAD ) */
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_HAVE_WINTHREAD )
+
+/* Windows libraries */
+#include <windows.h>
+#include <process.h>
+
+/* Standard C++ libaries */
+
+#include <cstdlib>
+#include <string>
+#include <iostream>
+#include <stdexcept>
+
+#include <Kokkos_Threads.hpp>
+
+//----------------------------------------------------------------------------
+// Driver for each created pthread
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+unsigned WINAPI internal_winthread_driver( void * arg )
+{
+  ThreadsExec::driver();
+
+  return 0 ;
+}
+
+class ThreadLockWindows {
+private:
+  CRITICAL_SECTION  m_handle ;
+
+  ~ThreadLockWindows()
+  { DeleteCriticalSection( & m_handle ); }
+
+  ThreadLockWindows();
+  { InitializeCriticalSection( & m_handle ); }
+
+  ThreadLockWindows( const ThreadLockWindows & );
+  ThreadLockWindows & operator = ( const ThreadLockWindows & );
+
+public:
+
+  static ThreadLockWindows & singleton();
+
+  void lock()
+  { EnterCriticalSection( & m_handle ); }
+
+  void unlock()
+  { LeaveCriticalSection( & m_handle ); }
+};
+
+ThreadLockWindows & ThreadLockWindows::singleton()
+{ static ThreadLockWindows self ; return self ; }
+
+} // namespace <>
+} // namespace Kokkos
+} // namespace Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Spawn this thread
+
+bool ThreadsExec::spawn()
+{
+  unsigned Win32ThreadID = 0 ;
+
+  HANDLE handle =
+    _beginthreadex(0,0,internal_winthread_driver,0,0, & Win32ThreadID );
+
+  return ! handle ;
+}
+
+bool ThreadsExec::is_process() { return true ; }
+
+void ThreadsExec::global_lock()
+{ ThreadLockWindows::singleton().lock(); }
+
+void ThreadsExec::global_unlock()
+{ ThreadLockWindows::singleton().unlock(); }
+
+void ThreadsExec::wait_yield( volatile int & flag , const int value ) {}
+{
+  while ( value == flag ) { Sleep(0); }
+}
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* end #elif defined( KOKKOS_HAVE_WINTHREAD ) */
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+
+
diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..53b5eb01dff4f745ef3e8486394dceda96457638
--- /dev/null
+++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp
@@ -0,0 +1,730 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADSTEAM_HPP
+#define KOKKOS_THREADSTEAM_HPP
+
+#include <stdio.h>
+
+#include <utility>
+#include <impl/Kokkos_spinwait.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+#include <Kokkos_Atomic.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< class > struct ThreadsExecAdapter ;
+
+//----------------------------------------------------------------------------
+
+class ThreadsExecTeamMember {
+private:
+
+  enum { TEAM_REDUCE_SIZE = 512 };
+
+  typedef Kokkos::Threads execution_space ;
+  typedef execution_space::scratch_memory_space space ;
+
+  ThreadsExec * const   m_exec ;
+  ThreadsExec * const * m_team_base ; ///< Base for team fan-in
+  space                 m_team_shared ;
+  int                   m_team_shared_size ;
+  int                   m_team_size ;
+  int                   m_team_rank ;
+  int                   m_team_rank_rev ;
+  int                   m_league_size ;
+  int                   m_league_end ;
+  int                   m_league_rank ;
+
+  inline
+  void set_team_shared()
+    { new( & m_team_shared ) space( ((char *) (*m_team_base)->scratch_memory()) + TEAM_REDUCE_SIZE , m_team_shared_size ); }
+  
+public:
+
+  // Fan-in and wait until the matching fan-out is called.
+  // The root thread which does not wait will return true.
+  // All other threads will return false during the fan-out.
+  KOKKOS_INLINE_FUNCTION bool team_fan_in() const
+    {
+      int n , j ;
+
+      // Wait for fan-in threads
+      for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
+        Impl::spinwait( m_team_base[j]->state() , ThreadsExec::Active );
+      }
+
+      // If not root then wait for release
+      if ( m_team_rank_rev ) {
+        m_exec->state() = ThreadsExec::Rendezvous ;
+        Impl::spinwait( m_exec->state() , ThreadsExec::Rendezvous );
+      }
+
+      return ! m_team_rank_rev ;
+    }
+
+  KOKKOS_INLINE_FUNCTION void team_fan_out() const
+    {
+      int n , j ;
+      for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
+        m_team_base[j]->state() = ThreadsExec::Active ;
+      }
+    }
+
+public:
+
+  KOKKOS_INLINE_FUNCTION static int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
+
+  KOKKOS_INLINE_FUNCTION
+  const execution_space::scratch_memory_space & team_shmem() const
+    { return m_team_shared ; }
+
+  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
+  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
+  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
+  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
+
+  KOKKOS_INLINE_FUNCTION void team_barrier() const
+    {
+      team_fan_in();
+      team_fan_out();
+    }
+
+  template<class ValueType>
+  KOKKOS_INLINE_FUNCTION
+  void team_broadcast(ValueType& value, const int& thread_id) const
+  {
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { }
+#else
+    // Make sure there is enough scratch space:
+    typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
+                         , ValueType , void >::type type ;
+
+    if ( m_team_base ) {
+      type * const local_value = ((type*) m_team_base[0]->scratch_memory());
+      if(team_rank() == thread_id) *local_value = value;
+      memory_fence();
+      team_barrier();
+      value = *local_value;
+    }
+#endif
+  }
+
+  template< typename Type >
+  KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return Type(); }
+#else
+    {
+      // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(Type) < TEAM_REDUCE_SIZE , Type , void >::type type ;
+
+      if ( 0 == m_exec ) return value ;
+
+      *((volatile type*) m_exec->scratch_memory() ) = value ;
+
+      memory_fence();
+
+      type & accum = *((type *) m_team_base[0]->scratch_memory() );
+
+      if ( team_fan_in() ) {
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          accum += *((type *) m_team_base[i]->scratch_memory() );
+        }
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      return accum ;
+    }
+#endif
+
+#ifdef KOKKOS_HAVE_CXX11
+  template< class ValueType, class JoinOp >
+  KOKKOS_INLINE_FUNCTION ValueType
+    team_reduce( const ValueType & value
+               , const JoinOp & op_in ) const
+  #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return ValueType(); }
+  #else
+    {
+      typedef ValueType value_type;
+      const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
+  #endif
+#else // KOKKOS_HAVE_CXX11
+  template< class JoinOp >
+  KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
+    team_reduce( const typename JoinOp::value_type & value
+               , const JoinOp & op ) const
+  #if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return typename JoinOp::value_type(); }
+  #else
+    {
+      typedef typename JoinOp::value_type value_type;
+  #endif
+#endif // KOKKOS_HAVE_CXX11
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
+                           , value_type , void >::type type ;
+
+      if ( 0 == m_exec ) return value ;
+
+      type * const local_value = ((type*) m_exec->scratch_memory());
+
+      // Set this thread's contribution
+      *local_value = value ;
+
+      // Fence to make sure the base team member has access:
+      memory_fence();
+
+      if ( team_fan_in() ) {
+        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
+        type * const team_value = ((type*) m_team_base[0]->scratch_memory());
+
+        // Join to the team value:
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          op.join( *team_value , *((type*) m_team_base[i]->scratch_memory()) );
+        }
+
+        // Team base thread may "lap" member threads so copy out to their local value.
+        for ( int i = 1 ; i < m_team_size ; ++i ) {
+          *((type*) m_team_base[i]->scratch_memory()) = *team_value ;
+        }
+
+        // Fence to make sure all team members have access
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      // Value was changed by the team base
+      return *((type volatile const *) local_value);
+    }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
+   *          with intra-team non-deterministic ordering accumulation.
+   *
+   *  The global inter-team accumulation value will, at the end of the
+   *  league's parallel execution, be the scan's total.
+   *  Parallel execution ordering of the league's teams is non-deterministic.
+   *  As such the base value for each team's scan operation is similarly
+   *  non-deterministic.
+   */
+  template< typename ArgType >
+  KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
+#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return ArgType(); }
+#else
+    {
+      // Make sure there is enough scratch space:
+      typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
+
+      if ( 0 == m_exec ) return type(0);
+
+      volatile type * const work_value  = ((type*) m_exec->scratch_memory());
+
+      *work_value = value ;
+
+      memory_fence();
+
+      if ( team_fan_in() ) {
+        // The last thread to synchronize returns true, all other threads wait for team_fan_out()
+        // m_team_base[0]                 == highest ranking team member
+        // m_team_base[ m_team_size - 1 ] == lowest ranking team member
+        //
+        // 1) copy from lower to higher rank, initialize lowest rank to zero
+        // 2) prefix sum from lowest to highest rank, skipping lowest rank
+
+        type accum = 0 ;
+
+        if ( global_accum ) {
+          for ( int i = m_team_size ; i-- ; ) {
+            type & val = *((type*) m_team_base[i]->scratch_memory());
+            accum += val ;
+          }
+          accum = atomic_fetch_add( global_accum , accum );
+        }
+
+        for ( int i = m_team_size ; i-- ; ) {
+          type & val = *((type*) m_team_base[i]->scratch_memory());
+          const type offset = accum ;
+          accum += val ;
+          val = offset ;
+        }
+
+        memory_fence();
+      }
+
+      team_fan_out();
+
+      return *work_value ;
+    }
+#endif
+
+  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
+   *
+   *  The highest rank thread can compute the reduction total as
+   *    reduction_total = dev.team_scan( value ) + value ;
+   */
+  template< typename ArgType >
+  KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value ) const
+    { return this-> template team_scan<ArgType>( value , 0 ); }
+
+
+  //----------------------------------------
+  // Private for the driver
+
+  template< class Arg0 , class Arg1 >
+  ThreadsExecTeamMember( Impl::ThreadsExec * exec
+                       , const TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > & team 
+                       , const int shared_size )
+    : m_exec( exec )
+    , m_team_base(0)
+    , m_team_shared(0,0)
+    , m_team_shared_size( shared_size )
+    , m_team_size(0)
+    , m_team_rank(0)
+    , m_team_rank_rev(0)
+    , m_league_size(0)
+    , m_league_end(0)
+    , m_league_rank(0)
+    {
+      if ( team.league_size() ) {
+        // Execution is using device-team interface:
+
+        const int pool_rank_rev = m_exec->pool_size() - ( m_exec->pool_rank() + 1 );
+        const int team_rank_rev = pool_rank_rev % team.team_alloc();
+
+        // May be using fewer threads per team than a multiple of threads per core,
+        // some threads will idle.
+
+        if ( team_rank_rev < team.team_size() ) {
+          const size_t pool_league_size     = m_exec->pool_size() / team.team_alloc() ;
+          const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc() ;
+          const size_t pool_league_rank     = pool_league_size - ( pool_league_rank_rev + 1 );
+
+          m_team_base        = m_exec->pool_base() + team.team_alloc() * pool_league_rank_rev ;
+          m_team_size        = team.team_size() ;
+          m_team_rank        = team.team_size() - ( team_rank_rev + 1 );
+          m_team_rank_rev    = team_rank_rev ;
+          m_league_size      = team.league_size();
+
+          m_league_rank      = ( team.league_size() *  pool_league_rank    ) / pool_league_size ;
+          m_league_end       = ( team.league_size() * (pool_league_rank+1) ) / pool_league_size ;
+
+          set_team_shared();
+        }
+      }
+    }
+
+  ThreadsExecTeamMember()
+    : m_exec(0)
+    , m_team_base(0)
+    , m_team_shared(0,0)
+    , m_team_shared_size(0)
+    , m_team_size(1)
+    , m_team_rank(0)
+    , m_team_rank_rev(0)
+    , m_league_size(1)
+    , m_league_end(0)
+    , m_league_rank(0)
+    {}
+
+  inline
+  ThreadsExec & threads_exec_team_base() const { return m_team_base ? **m_team_base : *m_exec ; }
+
+  bool valid() const
+    { return m_league_rank < m_league_end ; }
+
+  void next()
+    {
+      if ( ++m_league_rank < m_league_end ) {
+        team_barrier();
+        set_team_shared();
+      }
+    }
+
+  void set_league_shmem( const int arg_league_rank
+                       , const int arg_league_size
+                       , const int arg_shmem_size
+                       )
+    {
+      m_league_rank = arg_league_rank ;
+      m_league_size = arg_league_size ;
+      m_team_shared_size = arg_shmem_size ;
+      set_team_shared();
+    }
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+template< class Arg0 , class Arg1 >
+class TeamPolicy< Arg0 , Arg1 , Kokkos::Threads >
+{
+private:
+
+  int m_league_size ;
+  int m_team_size ;
+  int m_team_alloc ;
+
+  inline
+  void init( const int league_size_request 
+           , const int team_size_request )
+   {
+      const int pool_size  = execution_space::thread_pool_size(0);
+      const int team_max   = execution_space::thread_pool_size(1);
+      const int team_grain = execution_space::thread_pool_size(2);
+
+      m_league_size = league_size_request ;
+
+      m_team_size = team_size_request < team_max ?
+                    team_size_request : team_max ;
+
+      // Round team size up to a multiple of 'team_gain'
+      const int team_size_grain = team_grain * ( ( m_team_size + team_grain - 1 ) / team_grain );
+      const int team_count      = pool_size / team_size_grain ;
+
+      // Constraint : pool_size = m_team_alloc * team_count
+      m_team_alloc = pool_size / team_count ;
+   }
+
+
+public:
+
+  //! Tag this class as a kokkos execution policy
+  typedef TeamPolicy       execution_policy ; 
+  typedef Kokkos::Threads  execution_space ;
+
+  typedef typename
+    Impl::if_c< ! Impl::is_same< Kokkos::Threads , Arg0 >::value , Arg0 , Arg1 >::type
+      work_tag ;
+
+  //----------------------------------------
+
+  template< class FunctorType >
+  inline static
+  int team_size_max( const FunctorType & )
+    { return execution_space::thread_pool_size(1); }
+
+  template< class FunctorType >
+  static int team_size_recommended( const FunctorType & )
+    { return execution_space::thread_pool_size(2); }
+
+
+  template< class FunctorType >
+  inline static
+  int team_size_recommended( const FunctorType &, const int& )
+    { return execution_space::thread_pool_size(2); }
+
+  //----------------------------------------
+
+  inline int team_size() const { return m_team_size ; }
+  inline int team_alloc() const { return m_team_alloc ; }
+  inline int league_size() const { return m_league_size ; }
+
+  /** \brief  Specify league size, request team size */
+  TeamPolicy( execution_space & , int league_size_request , int team_size_request , int vector_length_request = 1 )
+    : m_league_size(0)
+    , m_team_size(0)
+    , m_team_alloc(0)
+    { init(league_size_request,team_size_request); (void) vector_length_request; }
+
+  TeamPolicy( int league_size_request , int team_size_request , int vector_length_request = 1 )
+    : m_league_size(0)
+    , m_team_size(0)
+    , m_team_alloc(0)
+    { init(league_size_request,team_size_request); (void) vector_length_request; }
+
+  typedef Impl::ThreadsExecTeamMember member_type ;
+
+  friend class Impl::ThreadsExecTeamMember ;
+};
+
+
+} /* namespace Kokkos */
+
+
+namespace Kokkos {
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>
+TeamThreadRange(const Impl::ThreadsExecTeamMember& thread, const iType& count)
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>(thread,count);
+}
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>
+TeamThreadRange( const Impl::ThreadsExecTeamMember& thread
+               , const iType & begin
+               , const iType & end
+               )
+{
+  return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>(thread,begin,end);
+}
+
+
+template<typename iType>
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >
+  ThreadVectorRange(const Impl::ThreadsExecTeamMember& thread, const iType& count) {
+  return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >(thread,count);
+}
+
+
+KOKKOS_INLINE_FUNCTION
+Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember> PerTeam(const Impl::ThreadsExecTeamMember& thread) {
+  return Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>(thread);
+}
+
+KOKKOS_INLINE_FUNCTION
+Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember> PerThread(const Impl::ThreadsExecTeamMember& thread) {
+  return Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>(thread);
+}
+} // namespace Kokkos
+
+namespace Kokkos {
+
+  /** \brief  Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
+   *
+   * The range i=0..N-1 is mapped to all threads of the the calling thread team.
+   * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda) {
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+}
+
+/** \brief  Inter-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all threads of the the calling thread team and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
+                     const Lambda & lambda, ValueType& result) {
+
+  result = ValueType();
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    result+=tmp;
+  }
+
+  result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
+}
+
+#if defined( KOKKOS_HAVE_CXX11 )
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember>& loop_boundaries,
+                     const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType result = init_result;
+
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    join(result,tmp);
+  }
+
+  init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
+}
+
+#endif /* #if defined( KOKKOS_HAVE_CXX11 ) */
+
+} //namespace Kokkos
+
+
+namespace Kokkos {
+/** \brief  Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
+ * This functionality requires C++11 support.*/
+template<typename iType, class Lambda>
+KOKKOS_INLINE_FUNCTION
+void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
+    loop_boundaries, const Lambda& lambda) {
+  #ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+  #pragma ivdep
+  #endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
+    lambda(i);
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
+ * val is performed and put into result. This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
+      loop_boundaries, const Lambda & lambda, ValueType& result) {
+  result = ValueType();
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    result+=tmp;
+  }
+}
+
+/** \brief  Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
+ * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
+ * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
+ * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
+ * '1 for *'). This functionality requires C++11 support.*/
+template< typename iType, class Lambda, typename ValueType, class JoinType >
+KOKKOS_INLINE_FUNCTION
+void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
+      loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
+
+  ValueType result = init_result;
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    ValueType tmp = ValueType();
+    lambda(i,tmp);
+    join(result,tmp);
+  }
+  init_result = result;
+}
+
+/** \brief  Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
+ *          for each i=0..N-1.
+ *
+ * The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
+ * Depending on the target execution space the operator might be called twice: once with final=false
+ * and once with final=true. When final==true val contains the prefix sum value. The contribution of this
+ * "i" needs to be added to val no matter whether final==true or not. In a serial execution
+ * (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
+ * to the final sum value over all vector lanes.
+ * This functionality requires C++11 support.*/
+template< typename iType, class FunctorType >
+KOKKOS_INLINE_FUNCTION
+void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
+      loop_boundaries, const FunctorType & lambda) {
+
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
+  typedef typename ValueTraits::value_type value_type ;
+
+  value_type scan_val = value_type();
+
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+#pragma ivdep
+#endif
+  for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
+    lambda(i,scan_val,true);
+  }
+}
+
+} // namespace Kokkos
+
+namespace Kokkos {
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda) {
+  lambda();
+}
+
+template<class FunctorType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda) {
+  if(single_struct.team_member.team_rank()==0) lambda();
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::VectorSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  lambda(val);
+}
+
+template<class FunctorType, class ValueType>
+KOKKOS_INLINE_FUNCTION
+void single(const Impl::ThreadSingleStruct<Impl::ThreadsExecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
+  if(single_struct.team_member.team_rank()==0) {
+    lambda(val);
+  }
+  single_struct.team_member.team_broadcast(val,0);
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_THREADSTEAM_HPP */
+
diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..4b2a16912693abfac48ffe87d04f4a4c1c9aa885
--- /dev/null
+++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp
@@ -0,0 +1,427 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_THREADS_PARALLEL_HPP
+#define KOKKOS_THREADS_PARALLEL_HPP
+
+#include <vector>
+
+#include <Kokkos_Parallel.hpp>
+
+#include <impl/Kokkos_StaticAssert.hpp>
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
+class ParallelFor< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
+
+  const FunctorType  m_func ;
+  const Policy       m_policy ;
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if<
+                 ( Impl::is_same< typename PType::work_tag , void >::value )
+                 , const FunctorType & >::type functor
+             , const PType & range )
+    {
+      const typename PType::member_type e = range.end();
+      for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
+        functor( i );
+      }
+    }
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if<
+                 ( ! Impl::is_same< typename PType::work_tag , void >::value )
+                 , const FunctorType & >::type functor
+             , const PType & range )
+    {
+      const typename PType::member_type e = range.end();
+      for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
+        functor( typename PType::work_tag() , i );
+      }
+    }
+
+  static void execute( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelFor & self = * ((const ParallelFor *) arg );
+
+    driver( self.m_func , typename Policy::WorkRange( self.m_policy , exec.pool_rank() , exec.pool_size() ) );
+
+    exec.fan_in();
+  }
+
+public:
+
+  ParallelFor( const FunctorType & functor
+             , const Policy      & policy )
+    : m_func( functor )
+    , m_policy( policy )
+    {
+      ThreadsExec::start( & ParallelFor::execute , this );
+
+      ThreadsExec::fence();
+    }
+};
+
+template< class FunctorType , class Arg0 , class Arg1 >
+class ParallelFor< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > >
+{
+private:
+
+  typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Threads >  Policy ;
+
+  const FunctorType  m_func ;
+  const Policy       m_policy ;
+  const int          m_shared ;
+
+  template< class TagType >
+  KOKKOS_FORCEINLINE_FUNCTION
+  void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
+                 const typename Policy::member_type & >::type member ) const
+    { m_func( member ); }
+
+  template< class TagType >
+  KOKKOS_FORCEINLINE_FUNCTION
+  void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
+                 const typename Policy::member_type & >::type member ) const
+    { m_func( TagType() , member ); }
+
+  static void execute( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelFor & self = * ((const ParallelFor *) arg );
+
+    typename Policy::member_type member( & exec , self.m_policy , self.m_shared );
+
+    for ( ; member.valid() ; member.next() ) {
+      self.ParallelFor::template driver< typename Policy::work_tag >( member );
+    }
+
+    exec.fan_in();
+  }
+
+public:
+
+  ParallelFor( const FunctorType & functor
+              , const Policy      & policy )
+    : m_func( functor )
+    , m_policy( policy )
+    , m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
+    {
+      ThreadsExec::resize_scratch( 0 , Policy::member_type::team_reduce_size() + m_shared );
+
+      ThreadsExec::start( & ParallelFor::execute , this );
+
+      ThreadsExec::fence();
+    }
+};
+
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
+class ParallelReduce< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
+  typedef typename Policy::work_tag                                   work_tag ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , work_tag >  ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType  m_func ;
+  const Policy       m_policy ;
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if<
+                 ( Impl::is_same< typename PType::work_tag , void >::value )
+                 , const FunctorType & >::type functor
+             , reference_type update
+             , const PType & range )
+    {
+      const typename PType::member_type e = range.end();
+      for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
+        functor( i , update );
+      }
+    }
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if<
+                 ( ! Impl::is_same< typename PType::work_tag , void >::value )
+                 , const FunctorType & >::type functor
+             , reference_type update
+             , const PType & range )
+    {
+      const typename PType::member_type e = range.end();
+      for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
+        functor( typename PType::work_tag() , i , update );
+      }
+    }
+
+  static void execute( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+
+    driver( self.m_func
+          , ValueInit::init( self.m_func , exec.reduce_memory() )
+          , typename Policy::WorkRange( self.m_policy , exec.pool_rank() , exec.pool_size() )
+          );
+
+    exec.template fan_in_reduce< FunctorType , work_tag >( self.m_func );
+  }
+
+public:
+
+  template< class HostViewType >
+  ParallelReduce( const FunctorType  & functor ,
+                  const Policy       & policy ,
+                  const HostViewType & result_view )
+    : m_func( functor )
+    , m_policy( policy )
+    {
+      ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , 0 );
+
+      ThreadsExec::start( & ParallelReduce::execute , this );
+
+      const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
+
+      ThreadsExec::fence();
+
+      if ( result_view.ptr_on_device() ) {
+        const unsigned n = ValueTraits::value_count( m_func );
+        for ( unsigned i = 0 ; i < n ; ++i ) { result_view.ptr_on_device()[i] = data[i]; }
+      }
+    }
+};
+
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class Arg0 , class Arg1 >
+class ParallelReduce< FunctorType , Kokkos::TeamPolicy< Arg0 , Arg1 , Kokkos::Threads > >
+{
+private:
+
+  typedef TeamPolicy< Arg0 , Arg1 , Kokkos::Threads >                 Policy ;
+  typedef typename Policy::work_tag                                   work_tag ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , work_tag >  ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType  m_func ;
+  const Policy       m_policy ;
+  const int          m_shared ;
+
+  template< class TagType >
+  KOKKOS_FORCEINLINE_FUNCTION
+  void driver( typename Impl::enable_if< Impl::is_same< TagType , void >::value ,
+                 const typename Policy::member_type & >::type member
+             , reference_type update ) const
+    { m_func( member , update ); }
+
+  template< class TagType >
+  KOKKOS_FORCEINLINE_FUNCTION
+  void driver( typename Impl::enable_if< ! Impl::is_same< TagType , void >::value ,
+                 const typename Policy::member_type & >::type member
+             , reference_type update ) const
+    { m_func( TagType() , member , update ); }
+
+  static void execute( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelReduce & self = * ((const ParallelReduce *) arg );
+
+    // Initialize thread-local value
+    reference_type update = ValueInit::init( self.m_func , exec.reduce_memory() );
+
+    typename Policy::member_type member( & exec , self.m_policy , self.m_shared );
+    for ( ; member.valid() ; member.next() ) {
+      self.ParallelReduce::template driver< work_tag >( member , update );
+    }
+
+    exec.template fan_in_reduce< FunctorType , work_tag >( self.m_func );
+  }
+
+public:
+
+  ParallelReduce( const FunctorType & functor
+                , const Policy      & policy )
+    : m_func( functor )
+    , m_policy( policy )
+    , m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
+    {
+      ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , Policy::member_type::team_reduce_size() + m_shared );
+
+      ThreadsExec::start( & ParallelReduce::execute , this );
+
+      ThreadsExec::fence();
+    }
+
+  template< class ViewType >
+  ParallelReduce( const FunctorType & functor
+                , const Policy      & policy
+                , const ViewType    & result )
+    : m_func( functor )
+    , m_policy( policy )
+    , m_shared( FunctorTeamShmemSize< FunctorType >::value( functor , policy.team_size() ) )
+    {
+      ThreadsExec::resize_scratch( ValueTraits::value_size( m_func ) , Policy::member_type::team_reduce_size() + m_shared );
+
+      ThreadsExec::start( & ParallelReduce::execute , this );
+
+      const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
+
+      ThreadsExec::fence();
+
+      const unsigned n = ValueTraits::value_count( m_func );
+      for ( unsigned i = 0 ; i < n ; ++i ) { result.ptr_on_device()[i] = data[i]; }
+    }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+template< class FunctorType , class Arg0 , class Arg1 , class Arg2 >
+class ParallelScan< FunctorType , Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > >
+{
+private:
+
+  typedef Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Threads > Policy ;
+  typedef typename Policy::work_tag                                   work_tag ;
+  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , work_tag >  ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , work_tag >  ValueInit ;
+
+  typedef typename ValueTraits::pointer_type    pointer_type ;
+  typedef typename ValueTraits::reference_type  reference_type ;
+
+  const FunctorType  m_func ;
+  const Policy       m_policy ;
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if<
+                 ( Impl::is_same< typename PType::work_tag , void >::value )
+                 , const FunctorType & >::type functor
+             , reference_type update
+             , const bool    final
+             , const PType & range )
+    {
+      const typename PType::member_type e = range.end();
+      for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
+        functor( i , update , final );
+      }
+    }
+
+  template< class PType >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void driver( typename Impl::enable_if<
+                 ( ! Impl::is_same< typename PType::work_tag , void >::value )
+                 , const FunctorType & >::type functor
+             , reference_type update
+             , const bool    final
+             , const PType & range )
+    {
+      const typename PType::member_type e = range.end();
+      for ( typename PType::member_type i = range.begin() ; i < e ; ++i ) {
+        functor( typename PType::work_tag() , i , update , final );
+      }
+    }
+
+  static void execute( ThreadsExec & exec , const void * arg )
+  {
+    const ParallelScan & self = * ((const ParallelScan *) arg );
+
+    const typename Policy::WorkRange range( self.m_policy , exec.pool_rank() , exec.pool_size() );
+
+    reference_type update = ValueInit::init( self.m_func , exec.reduce_memory() );
+
+    driver( self.m_func , update , false , range );
+
+    //  exec.<FunctorType,work_tag>scan_large( self.m_func );
+    exec.template scan_small<FunctorType,work_tag>( self.m_func );
+
+    driver( self.m_func , update , true , range );
+
+    exec.fan_in();
+  }
+
+public:
+
+  ParallelScan( const FunctorType & functor , const Policy & policy )
+    : m_func( functor )
+    , m_policy( policy )
+    {
+      ThreadsExec::resize_scratch( 2 * ValueTraits::value_size( m_func ) , 0 );
+      ThreadsExec::start( & ParallelScan::execute , this );
+      ThreadsExec::fence();
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #define KOKKOS_THREADS_PARALLEL_HPP */
+
diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..8ad7f15ecc2f9c0b6c623088d3fd341dc29c0c03
--- /dev/null
+++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.cpp
@@ -0,0 +1,599 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#include <stdio.h>
+#include <iostream>
+#include <sstream>
+#include <Threads/Kokkos_Threads_TaskPolicy.hpp>
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+typedef TaskMember< Kokkos::Threads , void , void > Task ;
+
+namespace {
+
+int    volatile s_count_serial = 0 ;
+int    volatile s_count_team   = 0 ;
+Task * volatile s_ready_team   = 0 ;
+Task * volatile s_ready_serial = 0 ;
+Task * const    s_lock   = reinterpret_cast<Task*>( ~((unsigned long)0) );
+Task * const    s_denied = reinterpret_cast<Task*>( ~((unsigned long)0) - 1 );
+
+} /* namespace */
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+namespace Kokkos {
+namespace Experimental {
+
+TaskPolicy< Kokkos::Threads >::TaskPolicy
+  ( const unsigned arg_default_dependence_capacity
+  , const unsigned arg_team_size
+  )
+  : m_default_dependence_capacity( arg_default_dependence_capacity )
+  , m_team_size( arg_team_size )
+{
+  const int threads_total    = Threads::thread_pool_size(0);
+  const int threads_per_numa = Threads::thread_pool_size(1);
+  const int threads_per_core = Threads::thread_pool_size(2);
+
+  if ( 0 == arg_team_size ) {
+    // If a team task then claim for execution until count is zero
+    // Issue: team collectives cannot assume which pool members are in the team.
+    // Issue: team must only span a single NUMA region.
+
+    // If more than one thread per core then map cores to work team,
+    // else  map numa to work team.
+
+    if      ( 1 < threads_per_core ) m_team_size = threads_per_core ;
+    else if ( 1 < threads_per_numa ) m_team_size = threads_per_numa ;
+    else                             m_team_size = 1 ;
+  }
+
+  // Verify a valid team size
+  const bool valid_team_size =
+    ( 0 < m_team_size && m_team_size <= threads_total ) &&
+    (
+      ( 1                == m_team_size ) ||
+      ( threads_per_core == m_team_size ) ||
+      ( threads_per_numa == m_team_size )
+    );
+
+  if ( ! valid_team_size ) {
+    std::ostringstream msg ;
+
+    msg << "Kokkos::Experimental::TaskPolicy< Kokkos::Threads > ERROR"
+        << " invalid team_size(" << m_team_size << ")"
+        << " threads_per_core(" << threads_per_core << ")"
+        << " threads_per_numa(" << threads_per_numa << ")"
+        << " threads_total(" << threads_total << ")"
+        ;
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+
+  }
+}
+
+TaskPolicy< Kokkos::Threads >::member_type &
+TaskPolicy< Kokkos::Threads >::member_single()
+{
+  static member_type s ;
+  return s ;
+}
+
+void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Threads > & policy )
+{
+  typedef Kokkos::Impl::ThreadsExecTeamMember member_type ;
+
+  enum { BASE_SHMEM = 1024 };
+
+  void * const arg = reinterpret_cast<void*>( long( policy.m_team_size ) );
+
+  Kokkos::Impl::ThreadsExec::resize_scratch( 0 , member_type::team_reduce_size() + BASE_SHMEM );
+  Kokkos::Impl::ThreadsExec::start( & Impl::Task::execute_ready_tasks_driver , arg );
+  Kokkos::Impl::ThreadsExec::fence();
+}
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+void Task::throw_error_verify_type()
+{
+  Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::verify_type ERROR");
+}
+
+void Task::deallocate( void * ptr )
+{
+  free( ptr );
+}
+
+void * Task::allocate( const unsigned n )
+{
+  void * const ptr = malloc(n);
+
+  return ptr ;
+}
+
+Task::~TaskMember()
+{
+}
+
+//----------------------------------------------------------------------------
+
+void Task::reschedule()
+{
+  // Reschedule transitions from executing back to waiting.
+  const int old_state = atomic_compare_exchange( & m_state , int(TASK_STATE_EXECUTING) , int(TASK_STATE_WAITING) );
+
+  if ( old_state != int(TASK_STATE_EXECUTING) ) {
+
+fprintf( stderr
+       , "reschedule ERROR task[%lx] state(%d)\n"
+       , (unsigned long) this
+       , old_state
+       );
+fflush(stderr);
+
+  }
+}
+
+void Task::schedule()
+{
+  //----------------------------------------
+  // State is either constructing or already waiting.
+  // If constructing then transition to waiting.
+
+  {
+    const int old_state = atomic_compare_exchange( & m_state , int(TASK_STATE_CONSTRUCTING) , int(TASK_STATE_WAITING) );
+    Task * const waitTask = *((Task * volatile const *) & m_wait );
+    Task * const next = *((Task * volatile const *) & m_next );
+
+    if ( s_denied == waitTask || 0 != next ||
+         ( old_state != int(TASK_STATE_CONSTRUCTING) &&
+           old_state != int(TASK_STATE_WAITING) ) ) {
+      fprintf(stderr,"Task::schedule task(0x%lx) STATE ERROR: state(%d) wait(0x%lx) next(0x%lx)\n"
+                    , (unsigned long) this
+                    , old_state
+                    , (unsigned long) waitTask
+                    , (unsigned long) next );
+      fflush(stderr);
+      Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::Task spawn or respawn state error");
+    }
+  }
+
+  //----------------------------------------
+  // Insert this task into another dependence that is not complete
+  // Push on to the wait queue, fails if ( s_denied == m_dep[i]->m_wait )
+
+  bool insert_in_ready_queue = true ;
+
+  for ( int i = 0 ; i < m_dep_size && insert_in_ready_queue ; ) {
+
+    Task * const task_dep = m_dep[i] ;
+    Task * const head_value_old = *((Task * volatile *) & task_dep->m_wait );
+
+    if ( s_denied == head_value_old ) {
+      // Wait queue is closed, try again with the next queue
+      ++i ;
+    }
+    else {
+
+      // Wait queue is open and not locked.
+      // If CAS succeeds then have acquired the lock.
+
+      // Have exclusive access to this task.
+      // Assign m_next assuming a successfull insertion into the queue.
+      // Fence the memory assignment before attempting the CAS.
+
+      *((Task * volatile *) & m_next ) = head_value_old ;
+
+      memory_fence();
+
+      // Attempt to insert this task into the queue
+
+      Task * const wait_queue_head = atomic_compare_exchange( & task_dep->m_wait , head_value_old , this );
+
+      if ( head_value_old == wait_queue_head ) {
+        insert_in_ready_queue = false ;
+      }
+    }
+  }
+
+  //----------------------------------------
+  // All dependences are complete, insert into the ready list
+
+  if ( insert_in_ready_queue ) {
+
+    // Increment the count of ready tasks.
+    // Count is decremented when task is complete.
+
+    Task * volatile * queue = 0 ;
+
+    if ( m_serial ) {
+      atomic_increment( & s_count_serial );
+      queue = & s_ready_serial ;
+    }
+    else {
+      atomic_increment( & s_count_team );
+      queue = & s_ready_team ;
+    }
+
+    while ( insert_in_ready_queue ) {
+
+      Task * const head_value_old = *queue ;
+
+      if ( s_lock != head_value_old ) {
+        // Read the head of ready queue, if same as previous value then CAS locks the ready queue
+        // Only access via CAS
+
+        // Have exclusive access to this task, assign to head of queue, assuming successful insert
+        // Fence assignment before attempting insert.
+        *((Task * volatile *) & m_next ) = head_value_old ;
+
+        memory_fence();
+
+        Task * const ready_queue_head = atomic_compare_exchange( queue , head_value_old , this );
+
+        if ( head_value_old == ready_queue_head ) {
+          // Successful insert
+          insert_in_ready_queue = false ; // done
+        }
+      }
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+void Task::assign( Task ** const lhs_ptr , Task * rhs )
+{
+  // Increment rhs reference count.
+  if ( rhs ) { atomic_increment( & rhs->m_ref_count ); }
+
+  // Assign the pointer and retrieve the previous value.
+
+  Task * const old_lhs = atomic_exchange( lhs_ptr , rhs );
+
+  if ( old_lhs ) {
+
+    // Decrement former lhs reference count.
+    // If reference count is zero task must be complete, then delete task.
+    // Task is ready for deletion when  wait == s_denied
+
+    int const count = atomic_fetch_add( & (old_lhs->m_ref_count) , -1 ) - 1 ;
+
+    // if 'count != 0' then 'old_lhs' may be deallocated before dereferencing
+    Task * const wait = count == 0 ? *((Task * const volatile *) & old_lhs->m_wait ) : (Task*) 0 ;
+
+    if ( count < 0 || ( count == 0 && wait != s_denied ) ) {
+
+      static const char msg_error_header[]  = "Kokkos::Impl::TaskManager<Kokkos::Threads>::assign ERROR deleting" ;
+
+      fprintf( stderr , "%s task(0x%lx) m_ref_count(%d) , m_wait(0x%ld)\n"
+                      , msg_error_header
+                      , (unsigned long) old_lhs
+                      , count
+                      , (unsigned long) wait );
+      fflush(stderr);
+
+      Kokkos::Impl::throw_runtime_exception( msg_error_header );
+    }
+
+    if ( count == 0 ) {
+      // When 'count == 0' this thread has exclusive access to 'old_lhs'
+      const Task::function_dealloc_type d = old_lhs->m_dealloc ;
+      (*d)( old_lhs );
+    }
+  }
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+Task * Task::get_dependence( int i ) const
+{
+  Task * const t = m_dep[i] ;
+
+  if ( Kokkos::Experimental::TASK_STATE_EXECUTING != m_state || i < 0 || m_dep_size <= i || 0 == t ) {
+
+fprintf( stderr
+       , "TaskMember< Threads >::get_dependence ERROR : task[%lx]{ state(%d) dep_size(%d) dep[%d] = %lx }\n"
+       , (unsigned long) this
+       , m_state
+       , m_dep_size
+       , i
+       , (unsigned long) t
+       );
+fflush( stderr );
+
+    Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::get_dependence ERROR");
+  }
+
+  return t ;
+}
+
+//----------------------------------------------------------------------------
+
+void Task::add_dependence( Task * before )
+{
+  if ( before != 0 ) {
+
+    int const state = *((volatile const int *) & m_state );
+
+    // Can add dependence during construction or during execution
+
+    if ( ( Kokkos::Experimental::TASK_STATE_CONSTRUCTING == state ||
+           Kokkos::Experimental::TASK_STATE_EXECUTING    == state ) &&
+         m_dep_size < m_dep_capacity ) {
+
+      ++m_dep_size ;
+
+      assign( m_dep + (m_dep_size-1) , before );
+
+      memory_fence();
+    }
+    else {
+
+fprintf( stderr
+       , "TaskMember< Threads >::add_dependence ERROR : task[%lx]{ state(%d) dep_size(%d) m_dep_capacity(%d) }\n"
+       , (unsigned long) this
+       , m_state
+       , m_dep_size
+       , m_dep_capacity
+       );
+fflush( stderr );
+
+      Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::add_dependence ERROR");
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+void Task::clear_dependence()
+{
+  for ( int i = m_dep_size - 1 ; 0 <= i ; --i ) {
+    assign( m_dep + i , 0 );
+  }
+
+  *((volatile int *) & m_dep_size ) = 0 ;
+
+  memory_fence();
+}
+
+//----------------------------------------------------------------------------
+
+Task * Task::pop_ready_task( Task * volatile * const queue )
+{
+  Task * const task_old = *queue ;
+
+  if ( s_lock != task_old && 0 != task_old ) {
+
+    Task * const task = atomic_compare_exchange( queue , task_old , s_lock );
+
+    if ( task_old == task ) {
+
+      // May have acquired the lock and task.
+      // One or more other threads may have acquired this same task and lock
+      // due to respawning ABA race condition.
+      // Can only be sure of acquire with a successful state transition from waiting to executing
+
+      const int old_state = atomic_compare_exchange( & task->m_state, int(TASK_STATE_WAITING), int(TASK_STATE_EXECUTING) );
+
+      if ( old_state == int(TASK_STATE_WAITING) ) {
+
+        // Transitioned this task from waiting to executing
+        // Update the queue to the next entry and release the lock
+
+        Task * const next_old = *((Task * volatile *) & task->m_next );
+
+        Task * const s = atomic_compare_exchange( queue , s_lock , next_old );
+
+        if ( s != s_lock ) {
+          fprintf(stderr,"Task::pop_ready_task( 0x%lx ) UNLOCK ERROR\n", (unsigned long) queue );
+          fflush(stderr);
+        }
+
+        *((Task * volatile *) & task->m_next ) = 0 ;
+
+        return task ;
+      }
+      else {
+        fprintf(stderr,"Task::pop_ready_task( 0x%lx ) task(0x%lx) state(%d) ERROR\n"
+                      , (unsigned long) queue
+                      , (unsigned long) task
+                      , old_state );
+        fflush(stderr);
+      }
+    }
+  }
+
+  return (Task *) 0 ;
+}
+
+
+void Task::complete_executed_task( Task * task , volatile int * const queue_count )
+{
+  // State is either executing or if respawned then waiting,
+  // try to transition from executing to complete.
+  // Reads the current value.
+
+  const int state_old =
+    atomic_compare_exchange( & task->m_state
+                           , int(Kokkos::Experimental::TASK_STATE_EXECUTING)
+                           , int(Kokkos::Experimental::TASK_STATE_COMPLETE) );
+
+  if ( Kokkos::Experimental::TASK_STATE_WAITING == state_old ) {
+    task->schedule(); /* Task requested a respawn so reschedule it */
+  }
+  else if ( Kokkos::Experimental::TASK_STATE_EXECUTING != state_old ) {
+    fprintf( stderr
+           , "TaskMember< Threads >::execute_serial completion ERROR : task[%lx]{ state_old(%d) dep_size(%d) }\n"
+           , (unsigned long) & task
+           , state_old
+           , task->m_dep_size
+           );
+    fflush( stderr );
+  }
+  else {
+
+    // Clear dependences of this task before locking wait queue
+
+    task->clear_dependence();
+
+    // Stop other tasks from adding themselves to this task's wait queue.
+    // The wait queue is updated concurrently so guard with an atomic.
+    // Setting the wait queue to denied denotes delete-ability of the task by any thread.
+    // Therefore, once 'denied' the task pointer must be treated as invalid.
+
+    Task * wait_queue     = *((Task * volatile *) & task->m_wait );
+    Task * wait_queue_old = 0 ;
+
+    do {
+      wait_queue_old = wait_queue ;
+      wait_queue     = atomic_compare_exchange( & task->m_wait , wait_queue_old , s_denied );
+    } while ( wait_queue_old != wait_queue );
+
+    task = 0 ;
+
+    // Pop waiting tasks and schedule them
+    while ( wait_queue ) {
+      Task * const x = wait_queue ; wait_queue = x->m_next ; x->m_next = 0 ;
+      x->schedule();
+    }
+  }
+
+  atomic_decrement( queue_count );
+}
+
+//----------------------------------------------------------------------------
+
+void Task::execute_ready_tasks_driver( Kokkos::Impl::ThreadsExec & exec , const void * arg )
+{
+  typedef Kokkos::Impl::ThreadsExecTeamMember member_type ;
+
+  // Whole pool is calling this function
+
+  // Create the thread team member with shared memory for the given task.
+  const int team_size = reinterpret_cast<long>( arg );
+
+  member_type member( & exec , TeamPolicy< Kokkos::Threads >( 1 , team_size ) , 0 );
+
+  Kokkos::Impl::ThreadsExec & exec_team_base = member.threads_exec_team_base();
+
+  Task * volatile * const task_team_ptr = reinterpret_cast<Task**>( exec_team_base.reduce_memory() );
+
+  if ( member.team_fan_in() ) {
+    *task_team_ptr = 0 ;
+    Kokkos::memory_fence();
+  }
+  member.team_fan_out();
+
+  long int iteration_count = 0 ;
+
+  // Each team must iterate this loop synchronously to insure team-execution of team-task
+
+  while ( 0 < s_count_serial || 0 < s_count_team ) {
+
+    if ( member.team_rank() == 0 ) {
+      // Only one team member attempts to pop a team task
+      *task_team_ptr = pop_ready_task( & s_ready_team );
+    }
+
+    // Query if team acquired a team task
+    Task * const task_team = *task_team_ptr ;
+
+    if ( task_team ) {
+      // Set shared memory
+      member.set_league_shmem( 0 , 1 , task_team->m_shmem_size );
+
+      (*task_team->m_team)( task_team , member );
+
+      // Do not proceed until all members have completed the task,
+      // the task has been completed or rescheduled, and
+      // the team task pointer has been cleared.
+      if ( member.team_fan_in() ) {
+        complete_executed_task( task_team , & s_count_team );
+        *task_team_ptr = 0 ;
+        Kokkos::memory_fence();
+      }
+      member.team_fan_out();
+    }
+    else {
+      Task * const task_serial = pop_ready_task( & s_ready_serial );
+
+      if ( task_serial ) {
+        if ( task_serial->m_serial ) (*task_serial->m_serial)( task_serial );
+
+        complete_executed_task( task_serial , & s_count_serial );
+      }
+    }
+
+    ++iteration_count ;
+  }
+
+  exec.fan_in();
+}
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
+
diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..024671324007da6f3dc668b113012234be73d77c
--- /dev/null
+++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_TaskPolicy.hpp
@@ -0,0 +1,584 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_THREADS_TASKPOLICY_HPP
+#define KOKKOS_THREADS_TASKPOLICY_HPP
+
+
+#include <Kokkos_Threads.hpp>
+#include <Kokkos_TaskPolicy.hpp>
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+/** \brief  Base class for all Kokkos::Threads tasks */
+template<>
+class TaskMember< Kokkos::Threads , void , void > {
+public:
+
+  typedef void         (* function_dealloc_type)( TaskMember * );
+  typedef TaskMember * (* function_verify_type) ( TaskMember * );
+  typedef void         (* function_single_type) ( TaskMember * );
+  typedef void         (* function_team_type)   ( TaskMember * , Kokkos::Impl::ThreadsExecTeamMember & );
+
+private:
+
+  // Needed to disambiguate references to base class variables
+  // without triggering a false-positive on Intel compiler warning #955.
+  typedef TaskMember< Kokkos::Threads , void , void > SelfType ;
+
+  function_dealloc_type  m_dealloc ;      ///< Deallocation
+  function_verify_type   m_verify ;       ///< Result type verification
+  function_team_type     m_team ;         ///< Apply function
+  function_single_type   m_serial ;       ///< Apply function
+  TaskMember **          m_dep ;          ///< Dependences
+  TaskMember *           m_wait ;         ///< Linked list of tasks waiting on this task
+  TaskMember *           m_next ;         ///< Linked list of tasks waiting on a different task
+  int                    m_dep_capacity ; ///< Capacity of dependences
+  int                    m_dep_size ;     ///< Actual count of dependences
+  int                    m_shmem_size ;
+  int                    m_ref_count ;    ///< Reference count
+  int                    m_state ;        ///< State of the task
+
+  // 7 pointers + 5 integers
+
+#if defined( KOKKOS_HAVE_CXX11 )
+  TaskMember( const TaskMember & ) = delete ;
+  TaskMember & operator = ( const TaskMember & ) = delete ;
+#else
+  TaskMember( const TaskMember & );
+  TaskMember & operator = ( const TaskMember & );
+#endif
+
+  static void * allocate( const unsigned arg_size );
+  static void deallocate( void * );
+
+  template< class DerivedTaskType >
+  static
+  void deallocate( TaskMember * t )
+    {
+      DerivedTaskType * ptr = static_cast< DerivedTaskType * >(t);
+      ptr->~DerivedTaskType();
+      deallocate( (void*) ptr );
+    }
+
+  static TaskMember * pop_ready_task( TaskMember * volatile * const queue );
+  static void complete_executed_task( TaskMember * , volatile int * const );
+
+  static void throw_error_verify_type();
+
+protected:
+
+  TaskMember()
+    : m_dealloc(0)
+    , m_verify(0)
+    , m_team(0)
+    , m_serial(0)
+    , m_dep(0)
+    , m_wait(0)
+    , m_next(0)
+    , m_dep_capacity(0)
+    , m_dep_size(0)
+    , m_shmem_size(0)
+    , m_ref_count(0)
+    , m_state(0)
+    {}
+
+public:
+
+  static void execute_ready_tasks_driver( Kokkos::Impl::ThreadsExec & , const void * );
+
+  ~TaskMember();
+
+  template< typename ResultType >
+  KOKKOS_FUNCTION static
+  TaskMember * verify_type( TaskMember * t )
+    {
+      enum { check_type = ! Kokkos::Impl::is_same< ResultType , void >::value };
+
+      if ( check_type && t != 0 ) {
+
+        // Verify that t->m_verify is this function
+        const function_verify_type self = & TaskMember::template verify_type< ResultType > ;
+
+        if ( t->m_verify != self ) {
+          t = 0 ;
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+          throw_error_verify_type();
+#endif
+        }
+      }
+      return t ;
+    }
+
+  //----------------------------------------
+  /*  Inheritence Requirements on task types:
+   *
+   *    class DerivedTaskType
+   *      : public TaskMember< Threads , DerivedType::value_type , FunctorType >
+   *      { ... };
+   *
+   *    class TaskMember< Threads , DerivedType::value_type , FunctorType >
+   *      : public TaskMember< Threads , DerivedType::value_type , void >
+   *      , public Functor
+   *      { ... };
+   *
+   *  If value_type != void
+   *    class TaskMember< Threads , value_type , void >
+   *      : public TaskMember< Threads , void , void >
+   *
+   *  Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
+   *
+   */
+  //----------------------------------------
+
+  template< class DerivedTaskType , class Tag >
+  KOKKOS_FUNCTION static
+  void apply_single( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< typename DerivedTaskType::result_type , void >::value
+                                                , TaskMember * >::type t )
+    {
+      typedef typename DerivedTaskType::functor_type  functor_type ;
+      typedef typename DerivedTaskType::result_type   result_type ;
+
+      DerivedTaskType & self = * static_cast< DerivedTaskType * >(t);
+
+      Kokkos::Impl::FunctorApply< functor_type , Tag , result_type & >
+        ::apply( (functor_type &) self , & self.m_result );
+    }
+
+  template< class DerivedTaskType , class Tag >
+  KOKKOS_FUNCTION static
+  void apply_single( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< typename DerivedTaskType::result_type , void >::value
+                                                , TaskMember * >::type t )
+    {
+      typedef typename DerivedTaskType::functor_type  functor_type ;
+
+      DerivedTaskType & self = * static_cast< DerivedTaskType * >(t);
+
+      Kokkos::Impl::FunctorApply< functor_type , Tag , void >::apply( (functor_type &) self );
+    }
+
+  //----------------------------------------
+
+  template< class DerivedTaskType , class Tag >
+  KOKKOS_FUNCTION static
+  void apply_team( typename Kokkos::Impl::enable_if<(
+                     Kokkos::Impl::is_same<Tag,void>::value
+                     &&
+                     Kokkos::Impl::is_same<typename DerivedTaskType::result_type,void>::value
+                   ), TaskMember * >::type t
+                 , Kokkos::Impl::ThreadsExecTeamMember & member
+                 )
+    {
+      DerivedTaskType & self = * static_cast< DerivedTaskType * >(t);
+
+      self.DerivedTaskType::functor_type::apply( member );
+    }
+
+  /** \brief  Allocate and construct a task */
+  template< class DerivedTaskType , class Tag >
+  KOKKOS_FUNCTION static
+  void apply_team( typename Kokkos::Impl::enable_if<(
+                     Kokkos::Impl::is_same<Tag,void>::value
+                     &&
+                     ! Kokkos::Impl::is_same<typename DerivedTaskType::result_type,void>::value
+                   ), TaskMember * >::type t
+                 , Kokkos::Impl::ThreadsExecTeamMember & member
+                 )
+    {
+      DerivedTaskType & self = * static_cast< DerivedTaskType * >(t);
+
+      self.DerivedTaskType::functor_type::apply( member , self.m_result );
+    }
+
+  //----------------------------------------
+
+  /** \brief  Allocate and construct a task */
+  template< class DerivedTaskType , class Tag >
+  static
+  TaskMember * create( const typename DerivedTaskType::functor_type &  arg_functor
+                     , const function_team_type                        arg_apply_team
+                     , const function_single_type                      arg_apply_single
+                     , const unsigned                                  arg_team_shmem
+                     , const unsigned                                  arg_dependence_capacity
+                     )
+    {
+      enum { padding_size = sizeof(DerivedTaskType) % sizeof(TaskMember*)
+                          ? sizeof(TaskMember*) - sizeof(DerivedTaskType) % sizeof(TaskMember*) : 0 };
+      enum { derived_size = sizeof(DerivedTaskType) + padding_size };
+
+      DerivedTaskType * const task =
+        new( allocate( derived_size + sizeof(TaskMember*) * arg_dependence_capacity ) )
+          DerivedTaskType( arg_functor );
+
+      task->SelfType::m_dealloc      = & TaskMember::template deallocate< DerivedTaskType > ;
+      task->SelfType::m_verify       = & TaskMember::template verify_type< typename DerivedTaskType::value_type > ;
+      task->SelfType::m_team         = arg_apply_team ;
+      task->SelfType::m_serial       = arg_apply_single ;
+      task->SelfType::m_dep          = (TaskMember**)( ((unsigned char *)task) + derived_size );
+      task->SelfType::m_dep_capacity = arg_dependence_capacity ;
+      task->SelfType::m_shmem_size   = arg_team_shmem ;
+      task->SelfType::m_state        = TASK_STATE_CONSTRUCTING ;
+
+      for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) task->SelfType::m_dep[i] = 0 ;
+
+      return static_cast< TaskMember * >( task );
+    }
+
+  void reschedule();
+  void schedule();
+
+  //----------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  static
+  void assign( TaskMember ** const lhs , TaskMember * const rhs );
+#else
+  KOKKOS_INLINE_FUNCTION static
+  void assign( TaskMember ** const lhs , TaskMember * const rhs ) {}
+#endif
+
+  TaskMember * get_dependence( int i ) const ;
+
+  KOKKOS_INLINE_FUNCTION
+  int get_dependence() const
+    { return m_dep_size ; }
+
+  void clear_dependence();
+  void add_dependence( TaskMember * before );
+
+  //----------------------------------------
+
+  typedef FutureValueTypeIsVoidError get_result_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const { return get_result_type() ; }
+
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::Experimental::TaskState get_state() const { return Kokkos::Experimental::TaskState( m_state ); }
+
+};
+
+/** \brief  A Future< Kokkos::Threads , ResultType > will cast
+ *          from  TaskMember< Kokkos::Threads , void , void >
+ *          to    TaskMember< Kokkos::Threads , ResultType , void >
+ *          to query the result.
+ */
+template< class ResultType >
+class TaskMember< Kokkos::Threads , ResultType , void >
+  : public TaskMember< Kokkos::Threads , void , void >
+{
+public:
+
+  typedef ResultType result_type ;
+
+  result_type  m_result ;
+
+  typedef const result_type & get_result_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const { return m_result ; }
+
+  inline
+  TaskMember() : TaskMember< Kokkos::Threads , void , void >(), m_result() {}
+
+#if defined( KOKKOS_HAVE_CXX11 )
+  TaskMember( const TaskMember & ) = delete ;
+  TaskMember & operator = ( const TaskMember & ) = delete ;
+#else
+private:
+  TaskMember( const TaskMember & );
+  TaskMember & operator = ( const TaskMember & );
+#endif
+};
+
+/** \brief  Callback functions will cast
+ *          from  TaskMember< Kokkos::Threads , void , void >
+ *          to    TaskMember< Kokkos::Threads , ResultType , FunctorType >
+ *          to execute work functions.
+ */
+template< class ResultType , class FunctorType >
+class TaskMember< Kokkos::Threads , ResultType , FunctorType >
+  : public TaskMember< Kokkos::Threads , ResultType , void >
+  , public FunctorType
+{
+public:
+  typedef ResultType   result_type ;
+  typedef FunctorType  functor_type ;
+
+  inline
+  TaskMember( const functor_type & arg_functor )
+    : TaskMember< Kokkos::Threads , ResultType , void >()
+    , functor_type( arg_functor )
+    {}
+};
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+void wait( TaskPolicy< Kokkos::Threads > & );
+
+template<>
+class TaskPolicy< Kokkos::Threads >
+{
+public:
+
+  typedef Kokkos::Threads                      execution_space ;
+  typedef TaskPolicy                           execution_policy ;
+  typedef Kokkos::Impl::ThreadsExecTeamMember  member_type ;
+
+private:
+
+  typedef Impl::TaskMember< Kokkos::Threads , void , void >  task_root_type ;
+
+  int m_default_dependence_capacity ;
+  int m_team_size ;    ///< Fixed size of a task-team
+
+  template< class FunctorType >
+  static inline
+  const task_root_type * get_task_root( const FunctorType * f )
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
+      return static_cast< const task_root_type * >( static_cast< const task_type * >(f) );
+    }
+
+  template< class FunctorType >
+  static inline
+  task_root_type * get_task_root( FunctorType * f )
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
+      return static_cast< task_root_type * >( static_cast< task_type * >(f) );
+    }
+
+public:
+
+  // Valid team sizes are 1,
+  // Threads::pool_size(1) == threads per numa, or
+  // Threads::pool_size(2) == threads per core
+
+  TaskPolicy( const unsigned arg_default_dependence_capacity = 4
+            , const unsigned arg_team_size = 0 /* default from thread pool topology */
+            );
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy( const TaskPolicy & rhs )
+    : m_default_dependence_capacity( rhs.m_default_dependence_capacity )
+    , m_team_size( rhs.m_team_size )
+    {}
+  
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy( const TaskPolicy & rhs
+            , const unsigned arg_default_dependence_capacity )
+    : m_default_dependence_capacity( arg_default_dependence_capacity )
+    , m_team_size( rhs.m_team_size )
+    {}
+
+  TaskPolicy & operator = ( const TaskPolicy &rhs ) {
+    m_default_dependence_capacity = rhs.m_default_dependence_capacity;
+    m_team_size = rhs.m_team_size;
+    return *this;
+  }
+
+  // Create serial-thread task
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  create( const FunctorType & functor
+        , const unsigned dependence_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type  value_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
+
+      return Future< value_type , execution_space >(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        task_root_type::create< task_type , void >
+          ( functor
+          , task_root_type::function_team_type(0)
+          , & task_root_type::template apply_single< task_type , void >
+          , 0
+          , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity )
+          )
+#endif
+        );
+    }
+
+  // Create thread-team task
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  create_team( const FunctorType & functor
+             , const unsigned dependence_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type  value_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
+
+      return Future< value_type , execution_space >(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        task_root_type::create< task_type , void >
+          ( functor
+          , & task_root_type::template apply_team< task_type , void >
+          , task_root_type::function_single_type(0)
+          , Kokkos::Impl::FunctorTeamShmemSize< FunctorType >::value( functor , m_team_size )
+          , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity )
+          )
+#endif
+        );
+    }
+
+  template< class A1 , class A2 , class A3 , class A4 >
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( const Future<A1,A2> & after
+                     , const Future<A3,A4> & before
+                     , typename Kokkos::Impl::enable_if
+                        < Kokkos::Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
+                          &&
+                          Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
+                        >::type * = 0
+                      ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      after.m_task->add_dependence( before.m_task );
+#endif
+    }
+
+  template< class FunctorType , class A3 , class A4 >
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( FunctorType * task_functor
+                     , const Future<A3,A4> & before
+                     , typename Kokkos::Impl::enable_if
+                        < Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
+                        >::type * = 0
+                      ) const
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { get_task_root(task_functor)->add_dependence( before.m_task ); }
+#else
+    {}
+#endif
+
+
+  template< class ValueType >
+  const Future< ValueType , execution_space > &
+    spawn( const Future< ValueType , execution_space > & f ) const
+      {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        f.m_task->schedule();
+#endif
+        return f ;
+      }
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  void respawn( FunctorType * task_functor ) const
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { get_task_root(task_functor)->reschedule(); }
+#else
+    {}
+#endif
+
+  //----------------------------------------
+  // Functions for an executing task functor to query dependences,
+  // set new dependences, and respawn itself.
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< void , execution_space >
+  get_dependence( const FunctorType * task_functor , int i ) const
+    {
+      return Future<void,execution_space>(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        get_task_root(task_functor)->get_dependence(i)
+#endif
+        );
+    }
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  int get_dependence( const FunctorType * task_functor ) const
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return get_task_root(task_functor)->get_dependence(); }
+#else
+    { return 0 ; }
+#endif
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  void clear_dependence( FunctorType * task_functor ) const
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { get_task_root(task_functor)->clear_dependence(); }
+#else
+    {}
+#endif
+
+  //----------------------------------------
+
+  static member_type & member_single();
+
+  friend void wait( TaskPolicy< Kokkos::Threads > & );
+};
+
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
+
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_THREADS_TASKPOLICY_HPP */
+
+
diff --git a/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.cpp b/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..50168fe3cc2db08069d94f75cb86bb1917f3eafe
--- /dev/null
+++ b/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.cpp
@@ -0,0 +1,275 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+bool
+SharedAllocationRecord< void , void >::
+is_sane( SharedAllocationRecord< void , void > * arg_record )
+{
+  constexpr static SharedAllocationRecord * zero = 0 ;
+
+  SharedAllocationRecord * const root = arg_record ? arg_record->m_root : 0 ;
+
+  bool ok = root != 0 && root->m_count == 0 ;
+
+  if ( ok ) {
+    SharedAllocationRecord * root_next = 0 ;
+
+    // Lock the list:
+    while ( ( root_next = Kokkos::atomic_exchange( & root->m_next , zero ) ) == 0 );
+
+    for ( SharedAllocationRecord * rec = root_next ; ok && rec != root ; rec = rec->m_next ) {
+      const bool ok_non_null  = rec && rec->m_prev && ( rec == root || rec->m_next );
+      const bool ok_root      = ok_non_null && rec->m_root == root ;
+      const bool ok_prev_next = ok_non_null && ( rec->m_prev != root ? rec->m_prev->m_next == rec : root_next == rec );
+      const bool ok_next_prev = ok_non_null && rec->m_next->m_prev == rec ;
+      const bool ok_count     = ok_non_null && 0 <= rec->m_count ;
+
+      ok = ok_root && ok_prev_next && ok_next_prev && ok_count ;
+
+if ( ! ok ) {
+  fprintf(stderr,"Kokkos::Experimental::Impl::SharedAllocationRecord failed is_sane: rec(0x%.12lx){ m_count(%d) m_root(0x%.12lx) m_next(0x%.12lx) m_prev(0x%.12lx) m_next->m_prev(0x%.12lx) m_prev->m_next(0x%.12lx) }\n"
+        , reinterpret_cast< unsigned long >( rec )
+        , rec->m_count
+        , reinterpret_cast< unsigned long >( rec->m_root )
+        , reinterpret_cast< unsigned long >( rec->m_next )
+        , reinterpret_cast< unsigned long >( rec->m_prev )
+        , reinterpret_cast< unsigned long >( rec->m_next->m_prev )
+        , reinterpret_cast< unsigned long >( rec->m_prev != rec->m_root ? rec->m_prev->m_next : root_next )
+        );
+}
+
+    }
+
+    if ( zero != Kokkos::atomic_exchange( & root->m_next , root_next ) ) {
+      Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed is_sane unlocking");
+    }
+  }
+
+  return ok ; 
+}
+
+SharedAllocationRecord<void,void> *
+SharedAllocationRecord<void,void>::find( SharedAllocationRecord<void,void> * const arg_root , void * const arg_data_ptr )
+{
+  constexpr static SharedAllocationRecord * zero = 0 ;
+
+  SharedAllocationRecord * root_next = 0 ;
+
+  // Lock the list:
+  while ( ( root_next = Kokkos::atomic_exchange( & arg_root->m_next , 0 ) ) == 0 );
+
+  // Iterate searching for the record with this data pointer
+
+  SharedAllocationRecord * r = root_next ;
+
+  while ( ( r != arg_root ) && ( r->data() != arg_data_ptr ) ) { r = r->m_next ; }
+
+  if ( r == arg_root ) { r = 0 ; }
+
+  if ( zero != Kokkos::atomic_exchange( & arg_root->m_next , root_next ) ) {
+    Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed locking/unlocking");
+  }
+
+  return r ;
+}
+
+
+/**\brief  Construct and insert into 'arg_root' tracking set.
+ *         use_count is zero.
+ */
+SharedAllocationRecord< void , void >::
+SharedAllocationRecord( SharedAllocationRecord<void,void> * arg_root
+                      , SharedAllocationHeader            * arg_alloc_ptr
+                      , size_t                              arg_alloc_size
+                      , SharedAllocationRecord< void , void >::function_type  arg_dealloc
+                      )
+  : m_alloc_ptr(  arg_alloc_ptr )
+  , m_alloc_size( arg_alloc_size )
+  , m_dealloc(    arg_dealloc )
+  , m_root( arg_root )
+  , m_prev( 0 )
+  , m_next( 0 )
+  , m_count( 0 )
+{
+  constexpr static SharedAllocationRecord * zero = 0 ;
+
+  // Insert into the root double-linked list for tracking
+  //
+  // before:  arg_root->m_next == next ; next->m_prev == arg_root
+  // after:   arg_root->m_next == this ; this->m_prev == arg_root ;
+  //              this->m_next == next ; next->m_prev == this
+
+  m_prev = m_root ;
+
+  // Read root->m_next and lock by setting to zero
+  while ( ( m_next = Kokkos::atomic_exchange( & m_root->m_next , zero ) ) == 0 );
+
+  m_next->m_prev = this ;
+
+  if ( zero != Kokkos::atomic_exchange( & m_root->m_next , this ) ) {
+    Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed locking/unlocking");
+  }
+}
+
+void
+SharedAllocationRecord< void , void >::
+increment( SharedAllocationRecord< void , void > * arg_record )
+{
+  const int old_count = Kokkos::atomic_fetch_add( & arg_record->m_count , 1 );
+
+  if ( old_count < 0 ) { // Error
+    Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed increment");
+  }
+}
+
+SharedAllocationRecord< void , void > *
+SharedAllocationRecord< void , void >::
+decrement( SharedAllocationRecord< void , void > * arg_record )
+{
+  constexpr static SharedAllocationRecord * zero = 0 ;
+
+  const int old_count = Kokkos::atomic_fetch_add( & arg_record->m_count , -1 );
+
+  if ( old_count == 1 ) {
+
+    // before:  arg_record->m_prev->m_next == arg_record  &&
+    //          arg_record->m_next->m_prev == arg_record
+    //
+    // after:   arg_record->m_prev->m_next == arg_record->m_next  &&
+    //          arg_record->m_next->m_prev == arg_record->m_prev
+
+    SharedAllocationRecord * root_next = 0 ;
+
+    // Lock the list:
+    while ( ( root_next = Kokkos::atomic_exchange( & arg_record->m_root->m_next , 0 ) ) == 0 );
+
+    arg_record->m_next->m_prev = arg_record->m_prev ;
+
+    if ( root_next != arg_record ) {
+      arg_record->m_prev->m_next = arg_record->m_next ;
+    }
+    else {
+      // before:  arg_record->m_root == arg_record->m_prev
+      // after:   arg_record->m_root == arg_record->m_next
+      root_next = arg_record->m_next ; 
+    }
+
+    // Unlock the list:
+    if ( zero != Kokkos::atomic_exchange( & arg_record->m_root->m_next , root_next ) ) {
+      Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed decrement unlocking");
+    }
+
+    arg_record->m_next = 0 ;
+    arg_record->m_prev = 0 ;
+
+    function_type d = arg_record->m_dealloc ;
+    (*d)( arg_record );
+    arg_record = 0 ;
+  }
+  else if ( old_count < 1 ) { // Error
+    Kokkos::Impl::throw_runtime_exception("Kokkos::Experimental::Impl::SharedAllocationRecord failed decrement count");
+  }
+
+  return arg_record ;
+}
+
+void
+SharedAllocationRecord< void , void >::
+print_host_accessible_records( std::ostream & s
+                             , const char * const space_name
+                             , const SharedAllocationRecord * const root
+                             , const bool detail )
+{
+  const SharedAllocationRecord< void , void > * r = root ;
+
+  char buffer[256] ;
+
+  if ( detail ) {
+    do {
+
+      snprintf( buffer , 256 , "%s addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + %.8ld ] count(%d) dealloc(0x%.12lx) %s\n"
+              , space_name
+              , reinterpret_cast<unsigned long>( r )
+              , reinterpret_cast<unsigned long>( r->m_prev )
+              , reinterpret_cast<unsigned long>( r->m_next )
+              , reinterpret_cast<unsigned long>( r->m_alloc_ptr )
+              , r->m_alloc_size
+              , r->m_count
+              , reinterpret_cast<unsigned long>( r->m_dealloc )
+              , r->m_alloc_ptr->m_label
+              );
+      std::cout << buffer ;
+      r = r->m_next ;
+    } while ( r != root );
+  }
+  else {
+    do {
+      if ( r->m_alloc_ptr ) {
+
+        snprintf( buffer , 256 , "%s [ 0x%.12lx + %ld ] %s\n"
+                , space_name
+                , reinterpret_cast< unsigned long >( r->data() )
+                , r->size()
+                , r->m_alloc_ptr->m_label
+                );
+      }
+      else {
+        snprintf( buffer , 256 , "%s [ 0 + 0 ]\n" , space_name );
+      }
+      std::cout << buffer ;
+      r = r->m_next ;
+    } while ( r != root );
+  }
+}
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+
diff --git a/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.hpp b/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..d9491b55329ca561af8df7e540848b158e3da4fe
--- /dev/null
+++ b/lib/kokkos/core/src/impl/KokkosExp_SharedAlloc.hpp
@@ -0,0 +1,287 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template< class MemorySpace = void , class DestroyFunctor = void >
+class SharedAllocationRecord ;
+
+class SharedAllocationHeader {
+private:
+
+  typedef SharedAllocationRecord<void,void>  Record ;
+
+  static constexpr unsigned maximum_label_length = ( 1u << 7 /* 128 */ ) - sizeof(Record*);
+
+  template< class , class > friend class SharedAllocationRecord ;
+
+  Record * m_record ;
+  char     m_label[ maximum_label_length ];
+
+public:
+
+  /* Given user memory get pointer to the header */
+  KOKKOS_INLINE_FUNCTION static
+  const SharedAllocationHeader * get_header( void * alloc_ptr )
+    { return reinterpret_cast<SharedAllocationHeader*>( reinterpret_cast<char*>(alloc_ptr) - sizeof(SharedAllocationHeader) ); }
+};
+
+template<>
+class SharedAllocationRecord< void , void > {
+protected:
+
+  static_assert( sizeof(SharedAllocationHeader) == ( 1u << 7 /* 128 */ ) , "sizeof(SharedAllocationHeader) != 128" );
+
+  template< class , class > friend class SharedAllocationRecord ;
+
+  typedef void (* function_type )( SharedAllocationRecord<void,void> * );
+
+  SharedAllocationHeader * const m_alloc_ptr ;
+  size_t                   const m_alloc_size ;
+  function_type            const m_dealloc ;
+  SharedAllocationRecord * const m_root ;
+  SharedAllocationRecord *       m_prev ;
+  SharedAllocationRecord *       m_next ;
+  int                            m_count ;
+
+  SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
+  SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
+
+  /**\brief  Construct and insert into 'arg_root' tracking set.
+   *         use_count is zero.
+   */
+  SharedAllocationRecord( SharedAllocationRecord * arg_root
+                        , SharedAllocationHeader * arg_alloc_ptr
+                        , size_t                   arg_alloc_size
+                        , function_type            arg_dealloc
+                        );
+
+public:
+
+  ~SharedAllocationRecord() = default ;
+
+  constexpr SharedAllocationRecord()
+    : m_alloc_ptr( 0 )
+    , m_alloc_size( 0 )
+    , m_dealloc( 0 )
+    , m_root( this )
+    , m_prev( this )
+    , m_next( this )
+    , m_count( 0 )
+    {}
+
+  static constexpr unsigned maximum_label_length = SharedAllocationHeader::maximum_label_length ;
+
+  KOKKOS_INLINE_FUNCTION
+  const SharedAllocationHeader * head() const { return m_alloc_ptr ; }
+
+  /* User's memory begins at the end of the header */
+  KOKKOS_INLINE_FUNCTION
+  void * data() const { return reinterpret_cast<void*>( m_alloc_ptr + 1 ); }
+
+  /* User's memory begins at the end of the header */
+  constexpr size_t size() const { return m_alloc_size - sizeof(SharedAllocationHeader) ; }
+
+  /* Cannot be 'constexpr' because 'm_count' is volatile */
+  int use_count() const { return m_count ; }
+
+  /* Increment use count */
+  static void increment( SharedAllocationRecord * );
+
+  /* Decrement use count. If 1->0 then remove from the tracking list and invoke m_dealloc */
+  static SharedAllocationRecord * decrement( SharedAllocationRecord * );
+
+  /* Given a root record and data pointer find the record */
+  static SharedAllocationRecord * find( SharedAllocationRecord * const , void * const );
+
+  /*  Sanity check for the whole set of records to which the input record belongs.
+   *  Locks the set's insert/erase operations until the sanity check is complete.
+   */
+  static bool is_sane( SharedAllocationRecord * );
+
+  /*  Print host-accessible records */
+  static void print_host_accessible_records( std::ostream &
+                                           , const char * const space_name
+                                           , const SharedAllocationRecord * const root
+                                           , const bool detail );
+};
+
+/*
+ *  Memory space specialization of SharedAllocationRecord< Space , void > requires :
+ *
+ *  SharedAllocationRecord< Space , void > : public SharedAllocationRecord< void , void >
+ *  {
+ *    // delete allocated user memory via static_cast to this type.
+ *    static void deallocate( const SharedAllocationRecord<void,void> * );
+ *    Space m_space ;
+ *  }
+ */
+
+template< class MemorySpace , class DestroyFunctor >
+class SharedAllocationRecord : public SharedAllocationRecord< MemorySpace , void >
+{
+private:
+
+  static void deallocate( SharedAllocationRecord<void,void> * record_ptr )
+    { delete static_cast<SharedAllocationRecord<MemorySpace,DestroyFunctor>*>(record_ptr); }
+
+  SharedAllocationRecord( const MemorySpace & arg_space
+                        , const std::string & arg_label
+                        , const size_t        arg_alloc
+                        )
+    /*  Allocate user memory as [ SharedAllocationHeader , user_memory ] */
+    : SharedAllocationRecord< MemorySpace , void >( arg_space , arg_label , arg_alloc , & deallocate )
+    , m_destroy()
+    {}
+
+  ~SharedAllocationRecord() { m_destroy.destroy_shared_allocation(); }
+
+public:
+
+  DestroyFunctor  m_destroy ;
+
+  // Allocate with a zero use count.  Incrementing the use count from zero to one
+  // inserts the record into the tracking list.  Decrementing the count from one to zero
+  // removes from the trakcing list and deallocates.
+  KOKKOS_INLINE_FUNCTION static
+  SharedAllocationRecord * allocate( const MemorySpace & arg_space 
+                                   , const std::string & arg_label
+                                   , const size_t        arg_alloc
+                                   )
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      return new SharedAllocationRecord( arg_space , arg_label , arg_alloc );
+#else
+      return (SharedAllocationRecord *) 0 ;
+#endif
+    }
+};
+
+union SharedAllocationTracker {
+private:
+
+  typedef SharedAllocationRecord<void,void>  Record ;
+
+  enum : unsigned long {
+    DO_NOT_DEREF_FLAG = 0x01ul
+  };
+
+  // The allocation record resides in Host memory space
+  Record * m_record ;
+  unsigned long m_record_bits;
+
+  KOKKOS_INLINE_FUNCTION
+  static Record * disable( Record * rec )
+    { return reinterpret_cast<Record*>( reinterpret_cast<unsigned long>( rec ) & DO_NOT_DEREF_FLAG ); }
+
+  KOKKOS_INLINE_FUNCTION
+  void increment() const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::increment( m_record );
+#endif
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void decrement() const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+       if ( ! ( m_record_bits & DO_NOT_DEREF_FLAG ) ) Record::decrement( m_record );
+#endif
+    }
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr SharedAllocationTracker() : m_record_bits( DO_NOT_DEREF_FLAG ) {}
+
+  template< class MemorySpace >
+  constexpr
+  SharedAllocationRecord< MemorySpace , void > & get_record() const
+    { return * static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record ); }
+
+  template< class MemorySpace >
+  std::string get_label() const
+    { return static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record )->get_label(); }
+
+  KOKKOS_INLINE_FUNCTION
+  SharedAllocationTracker( Record * arg_record )
+    : m_record( arg_record ) { increment(); }
+
+  KOKKOS_INLINE_FUNCTION
+  ~SharedAllocationTracker() { decrement(); }
+
+  KOKKOS_INLINE_FUNCTION
+  SharedAllocationTracker( const SharedAllocationTracker & rhs )
+    : m_record( rhs.m_record ) { increment(); }
+
+  KOKKOS_INLINE_FUNCTION
+  SharedAllocationTracker( SharedAllocationTracker && rhs )
+    : m_record( rhs.m_record ) { rhs.m_record_bits = DO_NOT_DEREF_FLAG ; }
+
+  KOKKOS_INLINE_FUNCTION
+  SharedAllocationTracker & operator = ( const SharedAllocationTracker & rhs )
+    {
+      decrement();
+      m_record = rhs.m_record ;
+      increment();
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  SharedAllocationTracker & operator = ( SharedAllocationTracker && rhs )
+    {
+      m_record = rhs.m_record ;
+      rhs.m_record_bits = DO_NOT_DEREF_FLAG ;
+      return *this ;
+    }
+};
+
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+
diff --git a/lib/kokkos/core/src/impl/KokkosExp_ViewAllocProp.hpp b/lib/kokkos/core/src/impl/KokkosExp_ViewAllocProp.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..348ccaf5ed3bcd8345e05d3880e6cc34badf017b
--- /dev/null
+++ b/lib/kokkos/core/src/impl/KokkosExp_ViewAllocProp.hpp
@@ -0,0 +1,416 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPERIMENTAL_IMPL_VIEW_ALLOC_PROP_HPP
+#define KOKKOS_EXPERIMENTAL_IMPL_VIEW_ALLOC_PROP_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+struct WithoutInitializing_t {};
+struct AllowPadding_t {};
+
+template< class ... Parameters >
+struct ViewAllocProp ;
+
+template<>
+struct ViewAllocProp<> {
+
+  struct NullSpace {};
+
+  typedef std::false_type  allow_padding_t ;
+  typedef std::true_type   initialize_t ;
+  typedef NullSpace        memory_space ;
+  typedef NullSpace        execution_space ;
+
+  const std::string      label ;
+  const memory_space     memory ;
+  const execution_space  execution ;
+  const allow_padding_t  allow_padding ;
+  const initialize_t     initialize ;
+
+  ViewAllocProp()
+    : label()
+    , memory()
+    , execution()
+    , allow_padding()
+    , initialize()
+    {}
+
+  ViewAllocProp( const std::string & arg_label )
+    : label( arg_label )
+    , memory()
+    , execution()
+    , allow_padding()
+    , initialize()
+    {}
+};
+
+template< class ... Parameters >
+struct ViewAllocProp< const char * , Parameters ... >
+{
+  typedef ViewAllocProp< Parameters ... >  base_prop_type ;
+
+  typedef typename base_prop_type::allow_padding_t  allow_padding_t ;
+  typedef typename base_prop_type::initialize_t     initialize_t ;
+  typedef typename base_prop_type::memory_space     memory_space ;
+  typedef typename base_prop_type::execution_space  execution_space ;
+
+  const std::string      label ;
+  const memory_space     memory ;
+  const execution_space  execution ;
+  const allow_padding_t  allow_padding ;
+  const initialize_t     initialize ;
+
+  ViewAllocProp( const char * const arg_label , Parameters ... arg_param )
+    : label( arg_label )
+    , memory( base_prop_type( arg_param ... ).memory )
+    , execution( base_prop_type( arg_param ... ).execution )
+    , allow_padding()
+    , initialize()
+    {}
+};
+
+template< class ... Parameters >
+struct ViewAllocProp< std::string , Parameters ... >
+{
+  typedef ViewAllocProp< Parameters ... >  base_prop_type ;
+
+  typedef typename base_prop_type::allow_padding_t  allow_padding_t ;
+  typedef typename base_prop_type::initialize_t     initialize_t ;
+  typedef typename base_prop_type::memory_space     memory_space ;
+  typedef typename base_prop_type::execution_space  execution_space ;
+
+  const std::string      label ;
+  const memory_space     memory ;
+  const execution_space  execution ;
+  const allow_padding_t  allow_padding ;
+  const initialize_t     initialize ;
+
+  ViewAllocProp( const std::string & arg_label , Parameters ... arg_param )
+    : label( arg_label )
+    , memory( base_prop_type( arg_param ... ).memory )
+    , execution( base_prop_type( arg_param ... ).execution )
+    , allow_padding()
+    , initialize()
+    {}
+};
+
+template< class ... Parameters >
+struct ViewAllocProp< WithoutInitializing_t , Parameters ... >
+{
+  typedef ViewAllocProp< Parameters ... >  base_prop_type ;
+
+  typedef typename base_prop_type::allow_padding_t  allow_padding_t ;
+  typedef std::false_type                           initialize_t ;
+  typedef typename base_prop_type::memory_space     memory_space ;
+  typedef typename base_prop_type::execution_space  execution_space ;
+
+  const std::string      label ;
+  const memory_space     memory ;
+  const execution_space  execution ;
+  const allow_padding_t  allow_padding ;
+  const initialize_t     initialize ;
+
+  ViewAllocProp( const WithoutInitializing_t & , Parameters ... arg_param )
+    : label( base_prop_type( arg_param ... ).label )
+    , memory( base_prop_type( arg_param ... ).memory )
+    , execution( base_prop_type( arg_param ... ).execution )
+    , allow_padding()
+    , initialize()
+    {}
+};
+
+template< class ... Parameters >
+struct ViewAllocProp< AllowPadding_t , Parameters ... >
+{
+  typedef ViewAllocProp< Parameters ... >  base_prop_type ;
+
+  typedef std::true_type                            allow_padding_t ;
+  typedef typename base_prop_type::initialize_t     initialize_t ;
+  typedef typename base_prop_type::memory_space     memory_space ;
+  typedef typename base_prop_type::execution_space  execution_space ;
+
+  const std::string label ;
+  const memory_space     memory ;
+  const execution_space  execution ;
+  const allow_padding_t  allow_padding ;
+  const initialize_t     initialize ;
+
+  ViewAllocProp( const AllowPadding_t & , Parameters ... arg_param )
+    : label( base_prop_type( arg_param ... ).label )
+    , memory( base_prop_type( arg_param ... ).memory )
+    , execution( base_prop_type( arg_param ... ).execution )
+    , allow_padding()
+    , initialize()
+    {}
+};
+
+template< class Space , class ... Parameters >
+struct ViewAllocProp< Space , Parameters ... >
+{
+  enum { is_exec = Kokkos::Impl::is_execution_space< Space >::value };
+  enum { is_mem  = Kokkos::Impl::is_memory_space< Space >::value };
+
+  static_assert( is_exec || is_mem , "View allocation given unknown parameter" );
+
+  typedef ViewAllocProp< Parameters ... >  base_prop_type ;
+
+  typedef typename base_prop_type::allow_padding_t  allow_padding_t ;
+  typedef typename base_prop_type::initialize_t     initialize_t ;
+  typedef typename std::conditional< is_mem  , Space , typename base_prop_type::memory_space >::type     memory_space ;
+  typedef typename std::conditional< is_exec , Space , typename base_prop_type::execution_space >::type  execution_space ;
+
+  const std::string      label ;
+  const memory_space     memory ;
+  const execution_space  execution ;
+  const allow_padding_t  allow_padding ;
+  const initialize_t     initialize ;
+
+  // Templated so that 'base_prop_type( args ... ).execution'
+  // is not used unless arg_space == memory_space.
+  template< class ... Args >
+  ViewAllocProp( const memory_space & arg_space , Args ... args )
+    : label( base_prop_type( args ... ).label )
+    , memory( arg_space )
+    , execution( base_prop_type( args ... ).execution )
+    , allow_padding()
+    , initialize()
+    {}
+
+  // Templated so that 'base_prop_type( args ... ).memory'
+  // is not used unless arg_space == execution_space.
+  template< class ... Args >
+  ViewAllocProp( const execution_space & arg_space , Args ... args )
+    : label( base_prop_type( args ... ).label )
+    , memory( base_prop_type( args ... ).memory )
+    , execution( arg_space )
+    , allow_padding()
+    , initialize()
+    {}
+};
+
+template< class ExecSpace , class MemSpace >
+struct ViewAllocProp< Kokkos::Device< ExecSpace , MemSpace > , std::string >
+{
+  typedef ViewAllocProp<>  base_prop_type ;
+
+  typedef typename base_prop_type::allow_padding_t  allow_padding_t ;
+  typedef typename base_prop_type::initialize_t     initialize_t ;
+  typedef MemSpace   memory_space ;
+  typedef ExecSpace  execution_space ;
+
+  const std::string      label ;
+  const memory_space     memory ;
+  const execution_space  execution ;
+  const allow_padding_t  allow_padding ;
+  const initialize_t     initialize ;
+
+  ViewAllocProp( const std::string & arg_label )
+    : label( arg_label )
+    , memory()
+    , execution()
+    , allow_padding()
+    , initialize()
+    {}
+};
+
+template< class ExecSpace , class MemSpace , unsigned N >
+struct ViewAllocProp< Kokkos::Device< ExecSpace , MemSpace > , char[N] >
+{
+  typedef ViewAllocProp<>  base_prop_type ;
+
+  typedef typename base_prop_type::allow_padding_t  allow_padding_t ;
+  typedef typename base_prop_type::initialize_t     initialize_t ;
+  typedef MemSpace   memory_space ;
+  typedef ExecSpace  execution_space  ;
+
+  const std::string      label ;
+  const memory_space     memory ;
+  const execution_space  execution ;
+  const allow_padding_t  allow_padding ;
+  const initialize_t     initialize ;
+
+  ViewAllocProp( const char * const arg_label )
+    : label( arg_label )
+    , memory()
+    , execution()
+    , allow_padding()
+    , initialize()
+    {}
+};
+
+
+// Deprecate in favor of view_alloc( Kokkos::WithoutInitializing )
+template< class ExecSpace , class MemSpace >
+struct ViewAllocProp< Kokkos::Device< ExecSpace , MemSpace >
+                    , Kokkos::ViewAllocateWithoutInitializing
+                    >
+{
+  typedef ViewAllocProp<>  base_prop_type ;
+
+  typedef typename base_prop_type::allow_padding_t  allow_padding_t ;
+  typedef std::false_type                           initialize_t ;
+  typedef MemSpace   memory_space ;
+  typedef ExecSpace  execution_space  ;
+
+  const std::string      label ;
+  const memory_space     memory ;
+  const execution_space  execution ;
+  const allow_padding_t  allow_padding ;
+  const initialize_t     initialize ;
+
+  ViewAllocProp( const Kokkos::ViewAllocateWithoutInitializing & arg )
+    : label( arg.label )
+    , memory()
+    , execution()
+    , allow_padding()
+    , initialize()
+    {}
+};
+
+template< class ExecSpace , class MemSpace , class ... Parameters >
+struct ViewAllocProp< Kokkos::Device< ExecSpace , MemSpace >
+                    , ViewAllocProp< Parameters ... >
+                    >
+{
+  typedef ViewAllocProp< Parameters ... >  base_prop_type ;
+
+  typedef typename base_prop_type::allow_padding_t  allow_padding_t ;
+  typedef typename base_prop_type::initialize_t     initialize_t ;
+  typedef MemSpace  memory_space ;
+
+  typedef
+    typename std::conditional
+      < Kokkos::Impl::is_execution_space< typename base_prop_type::execution_space >::value
+      , typename base_prop_type::execution_space
+      , ExecSpace
+      >::type  execution_space ;
+
+  static_assert( std::is_same< typename base_prop_type::memory_space , ViewAllocProp<>::NullSpace >::value ||
+                 std::is_same< typename base_prop_type::memory_space , memory_space >::value
+               , "View allocation given incompatible memory space" );
+
+  static_assert( Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename execution_space::memory_space
+                                                                  , memory_space >::value
+               , "View allocation given incompatible execution space" );
+
+  const std::string      label ;
+  const memory_space     memory ;
+  const execution_space  execution ;
+  const allow_padding_t  allow_padding ;
+  const initialize_t     initialize ;
+
+  // If the input properties have a memory or execution space then copy construct those spaces
+  // otherwise default construct those spaces.
+
+  template< class P >
+  ViewAllocProp( const P & arg_prop
+               , typename std::enable_if
+                   < std::is_same< P , base_prop_type >::value &&
+                     Kokkos::Impl::is_memory_space< typename P::memory_space >::value &&
+                     Kokkos::Impl::is_execution_space< typename P::memory_space >::value
+                   >::type * = 0 )
+    : label( arg_prop.label )
+    , memory( arg_prop.memory )
+    , execution( arg_prop.execution )
+    , allow_padding()
+    , initialize()
+    {}
+
+  template< class P >
+  ViewAllocProp( const P & arg_prop
+               , typename std::enable_if
+                   < std::is_same< P , base_prop_type >::value &&
+                     Kokkos::Impl::is_memory_space< typename P::memory_space >::value &&
+                     ! Kokkos::Impl::is_execution_space< typename P::execution_space >::value
+                   >::type * = 0 )
+    : label( arg_prop.label )
+    , memory( arg_prop.memory )
+    , execution()
+    , allow_padding()
+    , initialize()
+    {}
+
+  template< class P >
+  ViewAllocProp( const P & arg_prop
+               , typename std::enable_if
+                   < std::is_same< P , base_prop_type >::value &&
+                     ! Kokkos::Impl::is_memory_space< typename P::memory_space >::value &&
+                     Kokkos::Impl::is_execution_space< typename P::execution_space >::value
+                   >::type * = 0 )
+    : label( arg_prop.label )
+    , memory()
+    , execution( arg_prop.execution )
+    , allow_padding()
+    , initialize()
+    {}
+
+  template< class P >
+  ViewAllocProp( const P & arg_prop
+               , typename std::enable_if
+                   < std::is_same< P , base_prop_type >::value &&
+                     ! Kokkos::Impl::is_memory_space< typename P::memory_space >::value &&
+                     ! Kokkos::Impl::is_execution_space< typename P::execution_space >::value
+                   >::type * = 0 )
+    : label( arg_prop.label )
+    , memory()
+    , execution()
+    , allow_padding()
+    , initialize()
+    {}
+};
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif
+
diff --git a/lib/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp b/lib/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..bd2b4c675bd332f8eeea85bf52582c2e90fd02a8
--- /dev/null
+++ b/lib/kokkos/core/src/impl/KokkosExp_ViewMapping.hpp
@@ -0,0 +1,2683 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP
+#define KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP
+
+#include <type_traits>
+#include <initializer_list>
+
+#include <Kokkos_Pair.hpp>
+#include <Kokkos_Layout.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Atomic_View.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ExecPolicy > class ParallelFor ;
+
+}} /* namespace Kokkos::Impl */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template< long sN0 = -1
+        , long sN1 = -1
+        , long sN2 = -1
+        , long sN3 = -1
+        , long sN4 = -1
+        , long sN5 = -1
+        , long sN6 = -1
+        , long sN7 = -1
+        >
+struct ViewDimension {
+
+  enum { rank = ( sN0 < 0 ? 0 :
+                ( sN1 < 0 ? 1 :
+                ( sN2 < 0 ? 2 :
+                ( sN3 < 0 ? 3 :
+                ( sN4 < 0 ? 4 :
+                ( sN5 < 0 ? 5 :
+                ( sN6 < 0 ? 6 :
+                ( sN7 < 0 ? 7 : 8 )))))))) };
+  enum { rank_dynamic = 0 };
+
+  enum { N0 = 0 < sN0 ? sN0 : 1 };
+  enum { N1 = 0 < sN1 ? sN1 : 1 };
+  enum { N2 = 0 < sN2 ? sN2 : 1 };
+  enum { N3 = 0 < sN3 ? sN3 : 1 };
+  enum { N4 = 0 < sN4 ? sN4 : 1 };
+  enum { N5 = 0 < sN5 ? sN5 : 1 };
+  enum { N6 = 0 < sN6 ? sN6 : 1 };
+  enum { N7 = 0 < sN7 ? sN7 : 1 };
+
+  ViewDimension() = default ;
+  ViewDimension( const ViewDimension & ) = default ;
+  ViewDimension & operator = ( const ViewDimension & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewDimension( size_t   , unsigned , unsigned , unsigned 
+                         , unsigned , unsigned , unsigned , unsigned ) {}
+};
+
+template< long sN1
+        , long sN2
+        , long sN3
+        , long sN4
+        , long sN5
+        , long sN6
+        , long sN7
+        >
+struct ViewDimension< 0, sN1, sN2, sN3, sN4, sN5, sN6, sN7 > {
+
+  enum { rank = ( sN1 < 0 ? 1 :
+                ( sN2 < 0 ? 2 :
+                ( sN3 < 0 ? 3 :
+                ( sN4 < 0 ? 4 :
+                ( sN5 < 0 ? 5 :
+                ( sN6 < 0 ? 6 :
+                ( sN7 < 0 ? 7 : 8 ))))))) };
+  enum { rank_dynamic = 1 };
+
+  size_t N0 ; /* When 1 == rank_dynamic allow N0 >= 2^32 */
+  enum { N1 = 0 < sN1 ? sN1 : 1 };
+  enum { N2 = 0 < sN2 ? sN2 : 1 };
+  enum { N3 = 0 < sN3 ? sN3 : 1 };
+  enum { N4 = 0 < sN4 ? sN4 : 1 };
+  enum { N5 = 0 < sN5 ? sN5 : 1 };
+  enum { N6 = 0 < sN6 ? sN6 : 1 };
+  enum { N7 = 0 < sN7 ? sN7 : 1 };
+
+  ViewDimension() = default ;
+  ViewDimension( const ViewDimension & ) = default ;
+  ViewDimension & operator = ( const ViewDimension & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewDimension( size_t   aN0 , unsigned , unsigned , unsigned
+                         , unsigned , unsigned , unsigned , unsigned )
+    : N0( aN0 ) {}
+};
+
+template< long sN2
+        , long sN3
+        , long sN4
+        , long sN5
+        , long sN6
+        , long sN7
+        >
+struct ViewDimension< 0, 0, sN2, sN3, sN4, sN5, sN6, sN7 > {
+
+  enum { rank = ( sN2 < 0 ? 2 :
+                ( sN3 < 0 ? 3 :
+                ( sN4 < 0 ? 4 :
+                ( sN5 < 0 ? 5 :
+                ( sN6 < 0 ? 6 :
+                ( sN7 < 0 ? 7 : 8 )))))) };
+  enum { rank_dynamic = 2 };
+
+  size_t N0 ; /* When 2 == rank_dynamic allow N0 >= 2^32 */
+  size_t N1 ; /* When 2 == rank_dynamic allow N1 >= 2^32 */
+  enum { N2 = 0 < sN2 ? sN2 : 1 };
+  enum { N3 = 0 < sN3 ? sN3 : 1 };
+  enum { N4 = 0 < sN4 ? sN4 : 1 };
+  enum { N5 = 0 < sN5 ? sN5 : 1 };
+  enum { N6 = 0 < sN6 ? sN6 : 1 };
+  enum { N7 = 0 < sN7 ? sN7 : 1 };
+
+  ViewDimension() = default ;
+  ViewDimension( const ViewDimension & ) = default ;
+  ViewDimension & operator = ( const ViewDimension & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewDimension( size_t   aN0 , unsigned aN1 , unsigned , unsigned
+                         , unsigned , unsigned , unsigned , unsigned )
+    : N0( aN0 ) , N1( aN1 ) {}
+};
+
+template< long sN3
+        , long sN4
+        , long sN5
+        , long sN6
+        , long sN7
+        >
+struct ViewDimension< 0, 0, 0, sN3, sN4, sN5, sN6, sN7 > {
+
+  enum { rank = ( sN3 < 0 ? 3 :
+                ( sN4 < 0 ? 4 :
+                ( sN5 < 0 ? 5 :
+                ( sN6 < 0 ? 6 :
+                ( sN7 < 0 ? 7 : 8 ))))) };
+  enum { rank_dynamic = 3 };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+  enum { N3 = 0 < sN3 ? sN3 : 1 };
+  enum { N4 = 0 < sN4 ? sN4 : 1 };
+  enum { N5 = 0 < sN5 ? sN5 : 1 };
+  enum { N6 = 0 < sN6 ? sN6 : 1 };
+  enum { N7 = 0 < sN7 ? sN7 : 1 };
+
+  ViewDimension() = default ;
+  ViewDimension( const ViewDimension & ) = default ;
+  ViewDimension & operator = ( const ViewDimension & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewDimension( size_t   aN0 , unsigned aN1 , unsigned aN2 , unsigned
+                         , unsigned , unsigned , unsigned , unsigned )
+    : N0( aN0 ) , N1( aN1 ) , N2( aN2 ) {}
+};
+
+template< long sN4
+        , long sN5
+        , long sN6
+        , long sN7
+        >
+struct ViewDimension< 0, 0, 0, 0, sN4, sN5, sN6, sN7 > {
+
+  enum { rank = ( sN4 < 0 ? 4 :
+                ( sN5 < 0 ? 5 :
+                ( sN6 < 0 ? 6 :
+                ( sN7 < 0 ? 7 : 8 )))) };
+  enum { rank_dynamic = 4 };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+  unsigned N3 ;
+  enum { N4 = 0 < sN4 ? sN4 : 1 };
+  enum { N5 = 0 < sN5 ? sN5 : 1 };
+  enum { N6 = 0 < sN6 ? sN6 : 1 };
+  enum { N7 = 0 < sN7 ? sN7 : 1 };
+
+  ViewDimension() = default ;
+  ViewDimension( const ViewDimension & ) = default ;
+  ViewDimension & operator = ( const ViewDimension & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewDimension( size_t   aN0 , unsigned aN1 , unsigned aN2 , unsigned aN3
+                         , unsigned , unsigned , unsigned , unsigned )
+    : N0( aN0 ) , N1( aN1 ) , N2( aN2 ) , N3( aN3 ) {}
+};
+
+template< long sN5
+        , long sN6
+        , long sN7
+        >
+struct ViewDimension< 0, 0, 0, 0, 0, sN5, sN6, sN7 > {
+
+  enum { rank = ( sN5 < 0 ? 5 :
+                ( sN6 < 0 ? 6 :
+                ( sN7 < 0 ? 7 : 8 ))) };
+  enum { rank_dynamic = 5 };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+  unsigned N3 ;
+  unsigned N4 ;
+  enum { N5 = 0 < sN5 ? sN5 : 1 };
+  enum { N6 = 0 < sN6 ? sN6 : 1 };
+  enum { N7 = 0 < sN7 ? sN7 : 1 };
+
+  ViewDimension() = default ;
+  ViewDimension( const ViewDimension & ) = default ;
+  ViewDimension & operator = ( const ViewDimension & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewDimension( size_t   aN0 , unsigned aN1 , unsigned aN2 , unsigned aN3
+                         , unsigned aN4 , unsigned , unsigned , unsigned )
+    : N0( aN0 ) , N1( aN1 ) , N2( aN2 ) , N3( aN3 ) , N4( aN4 ) {}
+};
+
+template< long sN6
+        , long sN7
+        >
+struct ViewDimension< 0, 0, 0, 0, 0, 0, sN6, sN7 > {
+
+  enum { rank = ( sN6 < 0 ? 6 :
+                ( sN7 < 0 ? 7 : 8 )) };
+  enum { rank_dynamic = 6 };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+  unsigned N3 ;
+  unsigned N4 ;
+  unsigned N5 ;
+  enum { N6 = 0 < sN6 ? sN6 : 1 };
+  enum { N7 = 0 < sN7 ? sN7 : 1 };
+
+  ViewDimension() = default ;
+  ViewDimension( const ViewDimension & ) = default ;
+  ViewDimension & operator = ( const ViewDimension & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewDimension( size_t   aN0 , unsigned aN1 , unsigned aN2 , unsigned aN3
+                         , unsigned aN4 , unsigned aN5 , unsigned , unsigned )
+    : N0( aN0 ) , N1( aN1 ) , N2( aN2 ) , N3( aN3 ) , N4( aN4 ) , N5( aN5 ) {}
+};
+
+template< long sN7 >
+struct ViewDimension< 0, 0, 0, 0, 0, 0, 0, sN7 > {
+
+  enum { rank = ( sN7 < 0 ? 7 : 8 ) };
+  enum { rank_dynamic = 7 };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+  unsigned N3 ;
+  unsigned N4 ;
+  unsigned N5 ;
+  unsigned N6 ;
+  enum { N7 = 0 < sN7 ? sN7 : 1 };
+
+  ViewDimension() = default ;
+  ViewDimension( const ViewDimension & ) = default ;
+  ViewDimension & operator = ( const ViewDimension & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewDimension( size_t   aN0 , unsigned aN1 , unsigned aN2 , unsigned aN3
+                         , unsigned aN4 , unsigned aN5 , unsigned aN6 , unsigned )
+    : N0( aN0 ) , N1( aN1 ) , N2( aN2 ) , N3( aN3 ) , N4( aN4 ) , N5( aN5 ) , N6( aN6 ) {}
+};
+
+template<>
+struct ViewDimension< 0, 0, 0, 0, 0, 0, 0, 0 > {
+
+  enum { rank = 8 };
+  enum { rank_dynamic = 8 };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+  unsigned N3 ;
+  unsigned N4 ;
+  unsigned N5 ;
+  unsigned N6 ;
+  unsigned N7 ;
+
+  ViewDimension() = default ;
+  ViewDimension( const ViewDimension & ) = default ;
+  ViewDimension & operator = ( const ViewDimension & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewDimension( size_t   aN0 , unsigned aN1 , unsigned aN2 , unsigned aN3
+                         , unsigned aN4 , unsigned aN5 , unsigned aN6 , unsigned aN7 )
+    : N0( aN0 ) , N1( aN1 ) , N2( aN2 ) , N3( aN3 ) , N4( aN4 ) , N5( aN5 ) , N6( aN6 ) , N7( aN7 ) {}
+};
+
+//----------------------------------------------------------------------------
+
+template< class DstDim , class SrcDim >
+struct ViewDimensionAssignable ;
+
+template< long dN0 , long dN1 , long dN2 , long dN3 , long dN4 , long dN5 , long dN6 , long dN7 
+        , long sN0 , long sN1 , long sN2 , long sN3 , long sN4 , long sN5 , long sN6 , long sN7 >
+struct ViewDimensionAssignable< ViewDimension<dN0,dN1,dN2,dN3,dN4,dN5,dN6,dN7>
+                              , ViewDimension<sN0,sN1,sN2,sN3,sN4,sN5,sN6,sN7> >
+{
+  typedef ViewDimension<dN0,dN1,dN2,dN3,dN4,dN5,dN6,dN7>  dst ;
+  typedef ViewDimension<sN0,sN1,sN2,sN3,sN4,sN5,sN6,sN7>  src ;
+
+  enum { value = dst::rank == src::rank &&
+                 dst::rank_dynamic >= src::rank_dynamic &&
+                 ( 0 < dst::rank_dynamic || dN0 == sN0 ) &&
+                 ( 1 < dst::rank_dynamic || dN1 == sN1 ) &&
+                 ( 2 < dst::rank_dynamic || dN2 == sN2 ) &&
+                 ( 3 < dst::rank_dynamic || dN3 == sN3 ) &&
+                 ( 4 < dst::rank_dynamic || dN4 == sN4 ) &&
+                 ( 5 < dst::rank_dynamic || dN5 == sN5 ) &&
+                 ( 6 < dst::rank_dynamic || dN6 == sN6 ) &&
+                 ( 7 < dst::rank_dynamic || dN7 == sN7 ) };
+};
+
+//----------------------------------------------------------------------------
+
+template< class Dim , unsigned N , unsigned R = Dim::rank_dynamic >
+struct ViewDimensionInsert ;
+
+template< class Dim , unsigned N >
+struct ViewDimensionInsert< Dim , N , 0 >
+{
+  typedef ViewDimension< N
+                       , 0 < Dim::rank ? Dim::N0 : -1 
+                       , 1 < Dim::rank ? Dim::N1 : -1 
+                       , 2 < Dim::rank ? Dim::N2 : -1 
+                       , 3 < Dim::rank ? Dim::N3 : -1 
+                       , 4 < Dim::rank ? Dim::N4 : -1 
+                       , 5 < Dim::rank ? Dim::N5 : -1 
+                       , 6 < Dim::rank ? Dim::N6 : -1
+                       >  type ;
+};
+
+template< class Dim , unsigned N >
+struct ViewDimensionInsert< Dim , N , 1 >
+{
+  typedef ViewDimension< 0 , N
+                       , 1 < Dim::rank ? Dim::N1 : -1 
+                       , 2 < Dim::rank ? Dim::N2 : -1 
+                       , 3 < Dim::rank ? Dim::N3 : -1 
+                       , 4 < Dim::rank ? Dim::N4 : -1 
+                       , 5 < Dim::rank ? Dim::N5 : -1 
+                       , 6 < Dim::rank ? Dim::N6 : -1
+                       >  type ;
+};
+
+template< class Dim , unsigned N >
+struct ViewDimensionInsert< Dim , N , 2 >
+{
+  typedef ViewDimension< 0 , 0 , N
+                       , 2 < Dim::rank ? Dim::N2 : -1 
+                       , 3 < Dim::rank ? Dim::N3 : -1 
+                       , 4 < Dim::rank ? Dim::N4 : -1 
+                       , 5 < Dim::rank ? Dim::N5 : -1 
+                       , 6 < Dim::rank ? Dim::N6 : -1
+                       >  type ;
+};
+
+template< class Dim , unsigned N >
+struct ViewDimensionInsert< Dim , N , 3 >
+{
+  typedef ViewDimension< 0 , 0 , 0 , N
+                       , 3 < Dim::rank ? Dim::N3 : -1 
+                       , 4 < Dim::rank ? Dim::N4 : -1 
+                       , 5 < Dim::rank ? Dim::N5 : -1 
+                       , 6 < Dim::rank ? Dim::N6 : -1
+                       >  type ;
+};
+
+template< class Dim , unsigned N >
+struct ViewDimensionInsert< Dim , N , 4 >
+{
+  typedef ViewDimension< 0 , 0 , 0 , 0 , N
+                       , 4 < Dim::rank ? Dim::N4 : -1 
+                       , 5 < Dim::rank ? Dim::N5 : -1 
+                       , 6 < Dim::rank ? Dim::N6 : -1
+                       >  type ;
+};
+
+template< class Dim , unsigned N >
+struct ViewDimensionInsert< Dim , N , 5 >
+{
+  typedef ViewDimension< 0 , 0 , 0 , 0 , 0 , N
+                       , 5 < Dim::rank ? Dim::N5 : -1 
+                       , 6 < Dim::rank ? Dim::N6 : -1
+                       >  type ;
+};
+
+template< class Dim , unsigned N >
+struct ViewDimensionInsert< Dim , N , 6 >
+{
+  typedef ViewDimension< 0 , 0 , 0 , 0 , 0 , 0 , N
+                       , 6 < Dim::rank ? Dim::N6 : -1
+                       >  type ;
+};
+
+template< class Dim , unsigned N >
+struct ViewDimensionInsert< Dim , N , 7 >
+{
+  typedef ViewDimension< 0 , 0 , 0 , 0 , 0 , 0 , 0 , N >  type ;
+};
+
+}}} // namespace Kokkos::Experimental::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+/** \brief  Analyze the array dimensions defined by a Kokkos::View data type.
+ *
+ *  It is presumed that the data type can be mapped down to a multidimensional
+ *  array of an intrinsic scalar numerical type (double, float, int, ... ).
+ *  The 'value_type' of an array may be an embedded aggregate type such
+ *  as a fixed length array 'Array<T,N>'.
+ *  In this case the 'array_intrinsic_type' represents the
+ *  underlying array of intrinsic scalar numerical type.
+ *
+ *  The embedded aggregate type must have an AnalyzeShape specialization
+ *  to map it down to a shape and intrinsic scalar numerical type.
+ */
+template< class T >
+struct ViewDataAnalysis
+{
+  typedef void  specialize ; // No specialization
+
+  typedef ViewDimension<>  dimension ;
+
+  typedef T  type ;
+  typedef T  value_type ;
+  typedef T  array_scalar_type ;
+
+  typedef typename std::add_const< T >::type  const_type ;
+  typedef typename std::add_const< T >::type  const_value_type ;
+  typedef typename std::add_const< T >::type  const_array_scalar_type ;
+
+  typedef typename std::remove_const< T >::type  non_const_type ;
+  typedef typename std::remove_const< T >::type  non_const_value_type ;
+  typedef typename std::remove_const< T >::type  non_const_array_scalar_type ;
+};
+
+template< class T >
+struct ViewDataAnalysis< T * >
+{
+private:
+
+  typedef ViewDataAnalysis< T >  nested ;
+
+public:
+
+  typedef typename nested::specialize  specialize ;
+
+  typedef typename ViewDimensionInsert< typename nested::dimension , 0 >::type  dimension ;
+
+  typedef typename nested::type *               type ;
+  typedef typename nested::value_type           value_type ;
+  typedef typename nested::array_scalar_type *  array_scalar_type ;
+
+  typedef typename nested::const_type *               const_type ;
+  typedef typename nested::const_value_type           const_value_type ;
+  typedef typename nested::const_array_scalar_type *  const_array_scalar_type ;
+
+  typedef typename nested::non_const_type *               non_const_type ;
+  typedef typename nested::non_const_value_type           non_const_value_type ;
+  typedef typename nested::non_const_array_scalar_type *  non_const_array_scalar_type ;
+};
+
+template< class T >
+struct ViewDataAnalysis< T [] >
+{
+private:
+
+  typedef ViewDataAnalysis< T >  nested ;
+
+public:
+
+  typedef typename nested::specialize  specialize ;
+
+  typedef typename ViewDimensionInsert< typename nested::dimension , 0 >::type  dimension ;
+
+  typedef typename nested::type               type [] ;
+  typedef typename nested::value_type         value_type ;
+  typedef typename nested::array_scalar_type  array_scalar_type [] ;
+
+  typedef typename nested::const_type               const_type [] ;
+  typedef typename nested::const_value_type         const_value_type ;
+  typedef typename nested::const_array_scalar_type  const_array_scalar_type [] ;
+
+  typedef typename nested::non_const_type               non_const_type [] ;
+  typedef typename nested::non_const_value_type         non_const_value_type ;
+  typedef typename nested::non_const_array_scalar_type  non_const_array_scalar_type [] ;
+};
+
+template< class T , unsigned N >
+struct ViewDataAnalysis< T[N] >
+{
+private:
+
+  typedef ViewDataAnalysis< T >  nested ;
+
+public:
+
+  typedef typename nested::specialize  specialize ;
+
+  typedef typename ViewDimensionInsert< typename nested::dimension , N >::type  dimension ;
+
+  typedef typename nested::type               type [N] ;
+  typedef typename nested::value_type         value_type ;
+  typedef typename nested::array_scalar_type  array_scalar_type [N] ;
+
+  typedef typename nested::const_type               const_type [N] ;
+  typedef typename nested::const_value_type         const_value_type ;
+  typedef typename nested::const_array_scalar_type  const_array_scalar_type [N] ;
+
+  typedef typename nested::non_const_type               non_const_type [N] ;
+  typedef typename nested::non_const_value_type         non_const_value_type ;
+  typedef typename nested::non_const_array_scalar_type  non_const_array_scalar_type [N] ;
+};
+
+}}} // namespace Kokkos::Experimental::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template < class Dimension , class Layout , typename Enable = void >
+struct ViewOffset ;
+
+//----------------------------------------------------------------------------
+// LayoutLeft AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding
+template < class Dimension >
+struct ViewOffset< Dimension , Kokkos::LayoutLeft
+                 , typename std::enable_if<( 1 >= Dimension::rank
+                                             ||
+                                             0 == Dimension::rank_dynamic
+                                           )>::type >
+{
+  typedef size_t             size_type ;
+  typedef Dimension          dimension_type ;
+  typedef Kokkos::LayoutLeft array_layout ;
+
+  dimension_type m_dim ;
+
+  //----------------------------------------
+
+  // rank 1
+  template< typename I0 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 ) const { return i0 ; }
+
+  // rank 2
+  template < typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 , I1 const & i1 ) const
+    { return i0 + m_dim.N0 * i1 ; }
+
+  //rank 3
+  template < typename I0, typename I1, typename I2 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const
+  {
+    return i0 + m_dim.N0 * ( i1 + m_dim.N1 * i2 );
+  }
+
+  //rank 4
+  template < typename I0, typename I1, typename I2, typename I3 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const
+  {
+    return i0 + m_dim.N0 * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * i3 ));
+  }
+
+  //rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4 ) const
+  {
+    return i0 + m_dim.N0 * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * i4 )));
+  }
+
+  //rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5 ) const
+  {
+    return i0 + m_dim.N0 * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * i5 ))));
+  }
+
+  //rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6 ) const
+  {
+    return i0 + m_dim.N0 * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * (
+           i5 + m_dim.N5 * i6 )))));
+  }
+
+  //rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const
+  {
+    return i0 + m_dim.N0 * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * (
+           i5 + m_dim.N5 * (
+           i6 + m_dim.N6 * i7 ))))));
+  }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; }
+
+  /* Cardinality of the domain index space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type size() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  /* Span of the range space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type span() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return true ; }
+
+  /* Strides of dimensions */
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_dim.N0 * m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 ; }
+
+  // Stride with [ rank ] value is the total length
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      s[0] = 1 ;
+      if ( 0 < dimension_type::rank ) { s[1] = m_dim.N0 ; }
+      if ( 1 < dimension_type::rank ) { s[2] = s[1] * m_dim.N1 ; }
+      if ( 2 < dimension_type::rank ) { s[3] = s[2] * m_dim.N2 ; }
+      if ( 3 < dimension_type::rank ) { s[4] = s[3] * m_dim.N3 ; }
+      if ( 4 < dimension_type::rank ) { s[5] = s[4] * m_dim.N4 ; }
+      if ( 5 < dimension_type::rank ) { s[6] = s[5] * m_dim.N5 ; }
+      if ( 6 < dimension_type::rank ) { s[7] = s[6] * m_dim.N6 ; }
+      if ( 7 < dimension_type::rank ) { s[8] = s[7] * m_dim.N7 ; }
+    }
+
+  //----------------------------------------
+
+  ViewOffset() = default ;
+  ViewOffset( const ViewOffset & ) = default ;
+  ViewOffset & operator = ( const ViewOffset & ) = default ;
+
+  template< unsigned TrivialScalarSize >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( std::integral_constant<unsigned,TrivialScalarSize> const &
+                      , size_t aN0   , unsigned aN1 , unsigned aN2 , unsigned aN3
+                      , unsigned aN4 , unsigned aN5 , unsigned aN6 , unsigned aN7 )
+    : m_dim( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 )
+    {}
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs )
+    : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3 
+           , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 )
+    {
+      static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" );
+      // Also requires equal static dimensions ...
+    } 
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs )
+    : m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
+                   , "ViewOffset LayoutLeft and LayoutRight are only compatible when rank == 1" );
+    }
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutStride , void > & rhs )
+    : m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
+                   , "ViewOffset LayoutLeft and LayoutStride are only compatible when rank == 1" );
+      if ( rhs.m_stride.S0 != 1 ) {
+        Kokkos::abort("Kokkos::Experimental::ViewOffset assignment of LayoutLeft from LayoutStride  requires stride == 1" );
+      }
+    }
+
+  //----------------------------------------
+  // Subview construction
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs
+                      , const size_t n0
+                      , const size_t
+                      , const size_t
+                      , const size_t
+                      , const size_t
+                      , const size_t
+                      , const size_t
+                      , const size_t
+                      )
+    : m_dim( n0, 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert( ( 0 == dimension_type::rank ) ||
+                     ( 1 == dimension_type::rank && 1 == dimension_type::rank_dynamic && 1 <= DimRHS::rank )
+                   , "ViewOffset subview construction requires compatible rank" );
+    }
+};
+
+//----------------------------------------------------------------------------
+// LayoutLeft AND ( 1 < rank AND 0 < rank_dynamic ) : has padding / striding
+template < class Dimension >
+struct ViewOffset< Dimension , Kokkos::LayoutLeft
+                 , typename std::enable_if<( 1 < Dimension::rank
+                                             &&
+                                             0 < Dimension::rank_dynamic
+                                           )>::type >
+{
+  typedef size_t             size_type ;
+  typedef Dimension          dimension_type ;
+  typedef Kokkos::LayoutLeft array_layout ;
+
+  dimension_type m_dim ;
+  size_type      m_stride ;
+
+  //----------------------------------------
+
+  // rank 1
+  template< typename I0 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 ) const { return i0 ; }
+
+  // rank 2
+  template < typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 , I1 const & i1 ) const
+    { return i0 + m_stride * i1 ; }
+
+  //rank 3
+  template < typename I0, typename I1, typename I2 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const
+  {
+    return i0 + m_stride * ( i1 + m_dim.N1 * i2 );
+  }
+
+  //rank 4
+  template < typename I0, typename I1, typename I2, typename I3 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const
+  {
+    return i0 + m_stride * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * i3 ));
+  }
+
+  //rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4 ) const
+  {
+    return i0 + m_stride * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * i4 )));
+  }
+
+  //rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5 ) const
+  {
+    return i0 + m_stride * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * i5 ))));
+  }
+
+  //rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6 ) const
+  {
+    return i0 + m_stride * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * (
+           i5 + m_dim.N5 * i6 )))));
+  }
+
+  //rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const
+  {
+    return i0 + m_stride * (
+           i1 + m_dim.N1 * (
+           i2 + m_dim.N2 * (
+           i3 + m_dim.N3 * (
+           i4 + m_dim.N4 * (
+           i5 + m_dim.N5 * (
+           i6 + m_dim.N6 * i7 ))))));
+  }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; }
+
+  /* Cardinality of the domain index space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type size() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  /* Span of the range space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type span() const
+    { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return m_stride == m_dim.N0 ; }
+
+  /* Strides of dimensions */
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_stride ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_stride * m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_stride * m_dim.N1 * m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return m_stride * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 ; }
+
+  // Stride with [ rank ] value is the total length
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      s[0] = 1 ;
+      if ( 0 < dimension_type::rank ) { s[1] = m_stride ; }
+      if ( 1 < dimension_type::rank ) { s[2] = s[1] * m_dim.N1 ; }
+      if ( 2 < dimension_type::rank ) { s[3] = s[2] * m_dim.N2 ; }
+      if ( 3 < dimension_type::rank ) { s[4] = s[3] * m_dim.N3 ; }
+      if ( 4 < dimension_type::rank ) { s[5] = s[4] * m_dim.N4 ; }
+      if ( 5 < dimension_type::rank ) { s[6] = s[5] * m_dim.N5 ; }
+      if ( 6 < dimension_type::rank ) { s[7] = s[6] * m_dim.N6 ; }
+      if ( 7 < dimension_type::rank ) { s[8] = s[7] * m_dim.N7 ; }
+    }
+
+  //----------------------------------------
+
+private:
+
+  template< unsigned TrivialScalarSize >
+  struct Padding {
+    enum { div = TrivialScalarSize == 0 ? 0 : Kokkos::Impl::MEMORY_ALIGNMENT / ( TrivialScalarSize ? TrivialScalarSize : 1 ) };
+    enum { mod = TrivialScalarSize == 0 ? 0 : Kokkos::Impl::MEMORY_ALIGNMENT % ( TrivialScalarSize ? TrivialScalarSize : 1 ) };
+
+    // If memory alignment is a multiple of the trivial scalar size then attempt to align.
+    enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 };
+    enum { div_ok = div ? div : 1 }; // To valid modulo zero in constexpr
+
+    KOKKOS_INLINE_FUNCTION
+    static constexpr size_t stride( size_t const N )
+      {
+        return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) )
+               ? N + align - ( N % div_ok ) : N ;
+      }
+  };
+
+public:
+
+  ViewOffset() = default ;
+  ViewOffset( const ViewOffset & ) = default ;
+  ViewOffset & operator = ( const ViewOffset & ) = default ;
+
+  /* Enable padding for trivial scalar types with non-zero trivial scalar size */
+  template< unsigned TrivialScalarSize >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( std::integral_constant<unsigned,TrivialScalarSize> const & padding_type_size
+                      , size_t aN0   , unsigned aN1 , unsigned aN2 , unsigned aN3
+                      , unsigned aN4 , unsigned aN5 , unsigned aN6 , unsigned aN7 )
+    : m_dim( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 )
+    , m_stride( Padding<TrivialScalarSize>::stride( aN0 ) )
+    {}
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs )
+    : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3 
+           , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 )
+    , m_stride( rhs.stride_1() )
+    {
+      static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" );
+      // Also requires equal static dimensions ...
+    } 
+
+  //----------------------------------------
+  // Subview construction
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs
+                      , const size_t aN0
+                      , const size_t aN1
+                      , const size_t aN2
+                      , const size_t aN3
+                      , const size_t aN4
+                      , const size_t aN5
+                      , const size_t aN6
+                      , const size_t aN7
+                      )
+    : m_dim( aN0
+           , ( 1 < DimRHS::rank && aN1 ? aN1 :
+             ( 2 < DimRHS::rank && aN2 ? aN2 :
+             ( 3 < DimRHS::rank && aN3 ? aN3 :
+             ( 4 < DimRHS::rank && aN4 ? aN4 :
+             ( 5 < DimRHS::rank && aN5 ? aN5 :
+             ( 6 < DimRHS::rank && aN6 ? aN6 :
+             ( 7 < DimRHS::rank && aN7 ? aN7 : 0 )))))))
+           , 0, 0, 0, 0, 0, 0 )
+    , m_stride( ( 1 < DimRHS::rank && aN1 ? rhs.stride_1() :
+                ( 2 < DimRHS::rank && aN2 ? rhs.stride_2() :
+                ( 3 < DimRHS::rank && aN3 ? rhs.stride_3() :
+                ( 4 < DimRHS::rank && aN4 ? rhs.stride_4() :
+                ( 5 < DimRHS::rank && aN5 ? rhs.stride_5() :
+                ( 6 < DimRHS::rank && aN6 ? rhs.stride_6() :
+                ( 7 < DimRHS::rank && aN7 ? rhs.stride_7() : 0 ))))))) )
+    {
+      // This subview must be 2 == rank and 2 == rank_dynamic
+      // due to only having stride #0.
+      // The source dimension #0 must be non-zero for stride-one leading dimension.
+      // At most subsequent dimension can be non-zero.
+
+      static_assert( ( 2 == dimension_type::rank ) &&
+                     ( 2 == dimension_type::rank_dynamic ) &&
+                     ( 2 <= DimRHS::rank )
+                   , "ViewOffset subview construction requires compatible rank" );
+    }
+};
+
+//----------------------------------------------------------------------------
+// LayoutRight AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding
+template < class Dimension >
+struct ViewOffset< Dimension , Kokkos::LayoutRight
+                 , typename std::enable_if<( 1 >= Dimension::rank
+                                             ||
+                                             0 == Dimension::rank_dynamic
+                                           )>::type >
+{
+  typedef size_t              size_type ;
+  typedef Dimension           dimension_type ;
+  typedef Kokkos::LayoutRight array_layout ;
+
+  dimension_type m_dim ;
+
+  //----------------------------------------
+
+  // rank 1
+  template< typename I0 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 ) const { return i0 ; }
+
+  // rank 2
+  template < typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 , I1 const & i1 ) const
+    { return i1 + m_dim.N1 * i0 ; }
+
+  //rank 3
+  template < typename I0, typename I1, typename I2 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const
+  {
+    return i2 + m_dim.N2 * ( i1 + m_dim.N1 * ( i0 ));
+  }
+
+  //rank 4
+  template < typename I0, typename I1, typename I2, typename I3 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const
+  {
+    return i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * (
+           i1 + m_dim.N1 * ( i0 )));
+  }
+
+  //rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4 ) const
+  {
+    return i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * (
+           i1 + m_dim.N1 * ( i0 ))));
+  }
+
+  //rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5 ) const
+  {
+    return i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * (
+           i1 + m_dim.N1 * ( i0 )))));
+  }
+
+  //rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6 ) const
+  {
+    return i6 + m_dim.N6 * (
+           i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * (
+           i1 + m_dim.N1 * ( i0 ))))));
+  }
+
+  //rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const
+  {
+    return i7 + m_dim.N7 * (
+           i6 + m_dim.N6 * (
+           i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * (
+           i1 + m_dim.N1 * ( i0 )))))));
+  }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; }
+
+  /* Cardinality of the domain index space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type size() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  /* Span of the range space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type span() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return true ; }
+
+  /* Strides of dimensions */
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_dim.N7 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_dim.N7 * m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2 * m_dim.N1 ; }
+
+  // Stride with [ rank ] value is the total length
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      size_type n = 1 ;
+      if ( 7 < dimension_type::rank ) { s[7] = n ; n *= m_dim.N7 ; }
+      if ( 6 < dimension_type::rank ) { s[6] = n ; n *= m_dim.N6 ; }
+      if ( 5 < dimension_type::rank ) { s[5] = n ; n *= m_dim.N5 ; }
+      if ( 4 < dimension_type::rank ) { s[4] = n ; n *= m_dim.N4 ; }
+      if ( 3 < dimension_type::rank ) { s[3] = n ; n *= m_dim.N3 ; }
+      if ( 2 < dimension_type::rank ) { s[2] = n ; n *= m_dim.N2 ; }
+      if ( 1 < dimension_type::rank ) { s[1] = n ; n *= m_dim.N1 ; }
+      if ( 0 < dimension_type::rank ) { s[0] = n ; }
+      s[dimension_type::rank] = n * m_dim.N0 ;
+    }
+
+  //----------------------------------------
+
+  ViewOffset() = default ;
+  ViewOffset( const ViewOffset & ) = default ;
+  ViewOffset & operator = ( const ViewOffset & ) = default ;
+
+  template< unsigned TrivialScalarSize >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( std::integral_constant<unsigned,TrivialScalarSize> const &
+                      , size_t aN0   , unsigned aN1 , unsigned aN2 , unsigned aN3
+                      , unsigned aN4 , unsigned aN5 , unsigned aN6 , unsigned aN7 )
+    : m_dim( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 )
+    {}
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs )
+    : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3 
+           , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 )
+    {
+      static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" );
+      // Also requires equal static dimensions ...
+    } 
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs )
+    : m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
+                   , "ViewOffset LayoutRight and LayoutLeft are only compatible when rank == 1" );
+    }
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutStride , void > & rhs )
+    : m_dim( rhs.m_dim.N0, 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert( DimRHS::rank == 1 && dimension_type::rank == 1 && dimension_type::rank_dynamic == 1
+                   , "ViewOffset LayoutLeft and LayoutStride are only compatible when rank == 1" );
+      if ( rhs.m_stride.S0 != 1 ) {
+        Kokkos::abort("Kokkos::Experimental::ViewOffset assignment of LayoutRight from LayoutStride  requires stride == 1" );
+      }
+    }
+
+  //----------------------------------------
+  // Subview construction
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs
+                      , const size_t n0
+                      , const size_t
+                      , const size_t
+                      , const size_t
+                      , const size_t
+                      , const size_t
+                      , const size_t
+                      , const size_t
+                      )
+    : m_dim( n0, 0, 0, 0, 0, 0, 0, 0 )
+    {
+      static_assert( ( 0 == dimension_type::rank ) ||
+                     ( 1 == dimension_type::rank && 1 == dimension_type::rank_dynamic && 1 <= DimRHS::rank )
+                   , "ViewOffset subview construction requires compatible rank" );
+    }
+};
+
+//----------------------------------------------------------------------------
+// LayoutRight AND ( 1 < rank AND 0 < rank_dynamic ) : has padding / striding
+template < class Dimension >
+struct ViewOffset< Dimension , Kokkos::LayoutRight
+                 , typename std::enable_if<( 1 < Dimension::rank
+                                             &&
+                                             0 < Dimension::rank_dynamic
+                                           )>::type >
+{
+  typedef size_t               size_type ;
+  typedef Dimension            dimension_type ;
+  typedef Kokkos::LayoutRight  array_layout ;
+
+  dimension_type m_dim ;
+  size_type      m_stride ;
+
+  //----------------------------------------
+
+  // rank 1
+  template< typename I0 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 ) const { return i0 ; }
+
+  // rank 2
+  template < typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 , I1 const & i1 ) const
+  { return i1 + i0 * m_stride ; }
+
+  //rank 3
+  template < typename I0, typename I1, typename I2 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const
+  { return i2 + m_dim.N2 * ( i1 ) + i0 * m_stride ; }
+
+  //rank 4
+  template < typename I0, typename I1, typename I2, typename I3 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const
+  {
+    return i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * ( i1 )) +
+           i0 * m_stride ;
+  }
+
+  //rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4 ) const
+  {
+    return i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * ( i1 ))) +
+           i0 * m_stride ;
+  }
+
+  //rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5 ) const
+  {
+    return i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * ( i1 )))) +
+           i0 * m_stride ;
+  }
+
+  //rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6 ) const
+  {
+    return i6 + m_dim.N6 * (
+           i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * ( i1 ))))) +
+           i0 * m_stride ;
+  }
+
+  //rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const
+  {
+    return i7 + m_dim.N7 * (
+           i6 + m_dim.N6 * (
+           i5 + m_dim.N5 * (
+           i4 + m_dim.N4 * (
+           i3 + m_dim.N3 * (
+           i2 + m_dim.N2 * ( i1 )))))) +
+           i0 * m_stride ;
+  }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; }
+
+  /* Cardinality of the domain index space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type size() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+  /* Span of the range space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type span() const
+    { return m_dim.N0 * m_stride ; }
+
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const
+    { return m_stride == m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2 * m_dim.N1 ; }
+
+  /* Strides of dimensions */
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_dim.N7 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_dim.N7 * m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_stride ; }
+
+  // Stride with [ rank ] value is the total length
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      size_type n = 1 ;
+      if ( 7 < dimension_type::rank ) { s[7] = n ; n *= m_dim.N7 ; }
+      if ( 6 < dimension_type::rank ) { s[6] = n ; n *= m_dim.N6 ; }
+      if ( 5 < dimension_type::rank ) { s[5] = n ; n *= m_dim.N5 ; }
+      if ( 4 < dimension_type::rank ) { s[4] = n ; n *= m_dim.N4 ; }
+      if ( 3 < dimension_type::rank ) { s[3] = n ; n *= m_dim.N3 ; }
+      if ( 2 < dimension_type::rank ) { s[2] = n ; n *= m_dim.N2 ; }
+      if ( 1 < dimension_type::rank ) { s[1] = n ; }
+      if ( 0 < dimension_type::rank ) { s[0] = m_stride ; }
+      s[dimension_type::rank] = m_stride * m_dim.N0 ;
+    }
+
+  //----------------------------------------
+
+private:
+
+  template< unsigned TrivialScalarSize >
+  struct Padding {
+    enum { div = TrivialScalarSize == 0 ? 0 : Kokkos::Impl::MEMORY_ALIGNMENT / ( TrivialScalarSize ? TrivialScalarSize : 1 ) };
+    enum { mod = TrivialScalarSize == 0 ? 0 : Kokkos::Impl::MEMORY_ALIGNMENT % ( TrivialScalarSize ? TrivialScalarSize : 1 ) };
+
+    // If memory alignment is a multiple of the trivial scalar size then attempt to align.
+    enum { align = 0 != TrivialScalarSize && 0 == mod ? div : 0 };
+    enum { div_ok = div ? div : 1 }; // To valid modulo zero in constexpr
+
+    KOKKOS_INLINE_FUNCTION
+    static constexpr size_t stride( size_t const N )
+    {
+      return ( align && ( Kokkos::Impl::MEMORY_ALIGNMENT_THRESHOLD * align < N ) && ( N % div_ok ) )
+             ? N + align - ( N % div_ok ) : N ;
+    }
+  };
+
+public:
+
+  ViewOffset() = default ;
+  ViewOffset( const ViewOffset & ) = default ;
+  ViewOffset & operator = ( const ViewOffset & ) = default ;
+
+  /* Enable padding for trivial scalar types with non-zero trivial scalar size.  */
+  template< unsigned TrivialScalarSize >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( std::integral_constant<unsigned,TrivialScalarSize> const & padding_type_size
+                      , size_t aN0   , unsigned aN1 , unsigned aN2 , unsigned aN3
+                      , unsigned aN4 , unsigned aN5 , unsigned aN6 , unsigned aN7 )
+    : m_dim( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 )
+    , m_stride( Padding<TrivialScalarSize>::
+                  stride( /* 2 <= rank */
+                          m_dim.N1 * ( dimension_type::rank == 2 ? 1 :
+                          m_dim.N2 * ( dimension_type::rank == 3 ? 1 :
+                          m_dim.N3 * ( dimension_type::rank == 4 ? 1 :
+                          m_dim.N4 * ( dimension_type::rank == 5 ? 1 :
+                          m_dim.N5 * ( dimension_type::rank == 6 ? 1 :
+                          m_dim.N6 * ( dimension_type::rank == 7 ? 1 : m_dim.N7 )))))) ))
+    {}
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs )
+    : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3 
+           , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 )
+    , m_stride( rhs.stride_0() )
+    {
+      static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" );
+      // Also requires equal static dimensions ...
+    } 
+
+  //----------------------------------------
+  // Subview construction
+  // Last dimension must be non-zero
+
+  template< class DimRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , Kokkos::LayoutRight , void > & rhs
+                      , const size_t aN0
+                      , const size_t aN1
+                      , const size_t aN2
+                      , const size_t aN3
+                      , const size_t aN4
+                      , const size_t aN5
+                      , const size_t aN6
+                      , const size_t aN7
+                      )
+    : m_dim( // N0 == First non-zero dimension before the last dimension.
+             ( 1 < DimRHS::rank && aN0 ? aN0 :
+             ( 2 < DimRHS::rank && aN1 ? aN1 :
+             ( 3 < DimRHS::rank && aN2 ? aN2 :
+             ( 4 < DimRHS::rank && aN3 ? aN3 :
+             ( 5 < DimRHS::rank && aN4 ? aN4 :
+             ( 6 < DimRHS::rank && aN5 ? aN5 :
+             ( 7 < DimRHS::rank && aN6 ? aN6 : 0 )))))))
+           , // N1 == Last dimension.
+             ( 2 == DimRHS::rank ? aN1 :
+             ( 3 == DimRHS::rank ? aN2 :
+             ( 4 == DimRHS::rank ? aN3 :
+             ( 5 == DimRHS::rank ? aN4 :
+             ( 6 == DimRHS::rank ? aN5 :
+             ( 7 == DimRHS::rank ? aN6 : aN7 ))))))
+           , 0, 0, 0, 0, 0, 0 )
+    , m_stride( ( 1 < DimRHS::rank && aN0 ? rhs.stride_0() :
+                ( 2 < DimRHS::rank && aN1 ? rhs.stride_1() :
+                ( 3 < DimRHS::rank && aN2 ? rhs.stride_2() :
+                ( 4 < DimRHS::rank && aN3 ? rhs.stride_3() :
+                ( 5 < DimRHS::rank && aN4 ? rhs.stride_4() :
+                ( 6 < DimRHS::rank && aN5 ? rhs.stride_5() :
+                ( 7 < DimRHS::rank && aN6 ? rhs.stride_6() : 0 ))))))) )
+    {
+      // This subview must be 2 == rank and 2 == rank_dynamic
+      // due to only having stride #0.
+      // The source dimension #0 must be non-zero for stride-one leading dimension.
+      // At most subsequent dimension can be non-zero.
+
+      static_assert( ( 2 == dimension_type::rank ) &&
+                     ( 2 == dimension_type::rank_dynamic ) &&
+                     ( 2 <= DimRHS::rank )
+                   , "ViewOffset subview construction requires compatible rank" );
+    }
+};
+
+//----------------------------------------------------------------------------
+/* Strided array layout only makes sense for 0 < rank */
+
+template< unsigned Rank >
+struct ViewStride ;
+
+template<>
+struct ViewStride<1> {
+  size_t S0 ;
+  enum { S1 = 0 , S2 = 0 , S3 = 0 , S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewStride( size_t aS0 , size_t , size_t , size_t
+                      , size_t , size_t , size_t , size_t )
+    : S0( aS0 )
+    {}
+};
+
+template<>
+struct ViewStride<2> {
+  size_t S0 , S1 ;
+  enum { S2 = 0 , S3 = 0 , S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewStride( size_t aS0 , size_t aS1 , size_t , size_t
+                      , size_t , size_t , size_t , size_t )
+    : S0( aS0 ) , S1( aS1 )
+    {}
+};
+
+template<>
+struct ViewStride<3> {
+  size_t S0 , S1 , S2 ;
+  enum { S3 = 0 , S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 , size_t
+                      , size_t , size_t , size_t , size_t )
+    : S0( aS0 ) , S1( aS1 ) , S2( aS2 )
+    {}
+};
+
+template<>
+struct ViewStride<4> {
+  size_t S0 , S1 , S2 , S3 ;
+  enum { S4 = 0 , S5 = 0 , S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 , size_t aS3
+                      , size_t , size_t , size_t , size_t )
+    : S0( aS0 ) , S1( aS1 ) , S2( aS2 ) , S3( aS3 )
+    {}
+};
+
+template<>
+struct ViewStride<5> {
+  size_t S0 , S1 , S2 , S3 , S4 ;
+  enum { S5 = 0 , S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 , size_t aS3
+                      , size_t aS4 , size_t , size_t , size_t )
+    : S0( aS0 ) , S1( aS1 ) , S2( aS2 ) , S3( aS3 )
+    , S4( aS4 )
+    {}
+};
+
+template<>
+struct ViewStride<6> {
+  size_t S0 , S1 , S2 , S3 , S4 , S5 ;
+  enum { S6 = 0 , S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 , size_t aS3
+                      , size_t aS4 , size_t aS5 , size_t , size_t )
+    : S0( aS0 ) , S1( aS1 ) , S2( aS2 ) , S3( aS3 )
+    , S4( aS4 ) , S5( aS5 )
+    {}
+};
+
+template<>
+struct ViewStride<7> {
+  size_t S0 , S1 , S2 , S3 , S4 , S5 , S6 ;
+  enum { S7 = 0 };
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 , size_t aS3
+                      , size_t aS4 , size_t aS5 , size_t aS6 , size_t )
+    : S0( aS0 ) , S1( aS1 ) , S2( aS2 ) , S3( aS3 )
+    , S4( aS4 ) , S5( aS5 ) , S6( aS6 )
+    {}
+};
+
+template<>
+struct ViewStride<8> {
+  size_t S0 , S1 , S2 , S3 , S4 , S5 , S6 , S7 ;
+
+  ViewStride() = default ;
+  ViewStride( const ViewStride & ) = default ;
+  ViewStride & operator = ( const ViewStride & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewStride( size_t aS0 , size_t aS1 , size_t aS2 , size_t aS3
+                      , size_t aS4 , size_t aS5 , size_t aS6 , size_t aS7 )
+    : S0( aS0 ) , S1( aS1 ) , S2( aS2 ) , S3( aS3 )
+    , S4( aS4 ) , S5( aS5 ) , S6( aS6 ) , S7( aS7 )
+    {}
+};
+
+template < class Dimension >
+struct ViewOffset< Dimension , Kokkos::LayoutStride
+                 , typename std::enable_if<( 0 < Dimension::rank )>::type >
+{
+private:
+  typedef ViewStride< Dimension::rank >  stride_type ;
+public:
+
+  typedef size_t                size_type ;
+  typedef Dimension             dimension_type ;
+  typedef Kokkos::LayoutStride  array_layout ;
+
+  dimension_type  m_dim ;
+  stride_type     m_stride ;
+
+  //----------------------------------------
+
+  // rank 1
+  template< typename I0 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 ) const
+  {
+    return i0 * m_stride.S0 ;
+  }
+
+  // rank 2
+  template < typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0 , I1 const & i1 ) const
+  {
+    return i0 * m_stride.S0 +
+           i1 * m_stride.S1 ;
+  }
+
+  //rank 3
+  template < typename I0, typename I1, typename I2 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2 ) const
+  {
+    return i0 * m_stride.S0 +
+           i1 * m_stride.S1 +
+           i2 * m_stride.S2 ;
+  }
+
+  //rank 4
+  template < typename I0, typename I1, typename I2, typename I3 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3 ) const
+  {
+    return i0 * m_stride.S0 +
+           i1 * m_stride.S1 +
+           i2 * m_stride.S2 +
+           i3 * m_stride.S3 ;
+  }
+
+  //rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4 ) const
+  {
+    return i0 * m_stride.S0 +
+           i1 * m_stride.S1 +
+           i2 * m_stride.S2 +
+           i3 * m_stride.S3 +
+           i4 * m_stride.S4 ;
+  }
+
+  //rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5 ) const
+  {
+    return i0 * m_stride.S0 +
+           i1 * m_stride.S1 +
+           i2 * m_stride.S2 +
+           i3 * m_stride.S3 +
+           i4 * m_stride.S4 +
+           i5 * m_stride.S5 ;
+  }
+
+  //rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6 ) const
+  {
+    return i0 * m_stride.S0 +
+           i1 * m_stride.S1 +
+           i2 * m_stride.S2 +
+           i3 * m_stride.S3 +
+           i4 * m_stride.S4 +
+           i5 * m_stride.S5 +
+           i6 * m_stride.S6 ;
+  }
+
+  //rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+           , typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_INLINE_FUNCTION constexpr
+  size_type operator()( I0 const & i0, I1 const & i1, I2 const & i2, I3 const & i3
+                      , I4 const & i4, I5 const & i5, I6 const & i6, I7 const & i7 ) const
+  {
+    return i0 * m_stride.S0 +
+           i1 * m_stride.S1 +
+           i2 * m_stride.S2 +
+           i3 * m_stride.S3 +
+           i4 * m_stride.S4 +
+           i5 * m_stride.S5 +
+           i6 * m_stride.S6 +
+           i7 * m_stride.S7 ;
+  }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { return m_dim.N0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_1() const { return m_dim.N1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_2() const { return m_dim.N2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_3() const { return m_dim.N3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_4() const { return m_dim.N4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_5() const { return m_dim.N5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_6() const { return m_dim.N6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type dimension_7() const { return m_dim.N7 ; }
+
+  /* Cardinality of the domain index space */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type size() const
+    { return m_dim.N0 * m_dim.N1 * m_dim.N2 * m_dim.N3 * m_dim.N4 * m_dim.N5 * m_dim.N6 * m_dim.N7 ; }
+
+private:
+
+  KOKKOS_INLINE_FUNCTION
+  static constexpr size_type Max( size_type lhs , size_type rhs )
+    { return lhs < rhs ? rhs : lhs ; }
+
+public:
+
+  /* Span of the range space, largest stride * dimension */
+  KOKKOS_INLINE_FUNCTION
+  constexpr size_type span() const
+    {
+      return Max( m_dim.N0 * m_stride.S0 ,
+             Max( m_dim.N1 * m_stride.S1 ,
+             Max( m_dim.N2 * m_stride.S2 ,
+             Max( m_dim.N3 * m_stride.S3 ,
+             Max( m_dim.N4 * m_stride.S4 ,
+             Max( m_dim.N5 * m_stride.S5 ,
+             Max( m_dim.N6 * m_stride.S6 ,
+                  m_dim.N7 * m_stride.S7 )))))));
+    }
+
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return span() == size(); }
+
+  /* Strides of dimensions */
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_stride.S0 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { return m_stride.S1 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { return m_stride.S2 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { return m_stride.S3 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { return m_stride.S4 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { return m_stride.S5 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_6() const { return m_stride.S6 ; }
+  KOKKOS_INLINE_FUNCTION constexpr size_type stride_7() const { return m_stride.S7 ; }
+
+  // Stride with [ rank ] value is the total length
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      if ( 0 < dimension_type::rank ) { s[0] = m_stride.S0 ; }
+      if ( 1 < dimension_type::rank ) { s[1] = m_stride.S1 ; }
+      if ( 2 < dimension_type::rank ) { s[2] = m_stride.S2 ; }
+      if ( 3 < dimension_type::rank ) { s[3] = m_stride.S3 ; }
+      if ( 4 < dimension_type::rank ) { s[4] = m_stride.S4 ; }
+      if ( 5 < dimension_type::rank ) { s[5] = m_stride.S5 ; }
+      if ( 6 < dimension_type::rank ) { s[6] = m_stride.S6 ; }
+      if ( 7 < dimension_type::rank ) { s[7] = m_stride.S7 ; }
+      s[dimension_type::rank] = span();
+    }
+
+  //----------------------------------------
+
+  ViewOffset() = default ;
+  ViewOffset( const ViewOffset & ) = default ;
+  ViewOffset & operator = ( const ViewOffset & ) = default ;
+
+  KOKKOS_INLINE_FUNCTION
+  ViewOffset( const Kokkos::LayoutStride & rhs )
+    : m_dim( rhs.dimension[0] , rhs.dimension[1] , rhs.dimension[2] , rhs.dimension[3]
+           , rhs.dimension[4] , rhs.dimension[5] , rhs.dimension[6] , rhs.dimension[7] )
+    , m_stride( rhs.stride[0] , rhs.stride[1] , rhs.stride[2] , rhs.stride[3]
+              , rhs.stride[4] , rhs.stride[5] , rhs.stride[6] , rhs.stride[7] )
+    {}
+
+  template< class DimRHS , class LayoutRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , LayoutRHS , void > & rhs )
+    : m_dim( rhs.m_dim.N0 , rhs.m_dim.N1 , rhs.m_dim.N2 , rhs.m_dim.N3 
+           , rhs.m_dim.N4 , rhs.m_dim.N5 , rhs.m_dim.N6 , rhs.m_dim.N7 )
+    , m_stride( rhs.stride_0() , rhs.stride_1() , rhs.stride_2() , rhs.stride_3()
+              , rhs.stride_4() , rhs.stride_5() , rhs.stride_6() , rhs.stride_7() )
+    {
+      static_assert( int(DimRHS::rank) == int(dimension_type::rank) , "ViewOffset assignment requires equal rank" );
+      // Also requires equal static dimensions ...
+    }
+
+  //----------------------------------------
+  // Subview construction
+
+private:
+
+  KOKKOS_INLINE_FUNCTION
+  static constexpr unsigned
+    count_non_zero( const size_t aN0 = 0 
+                  , const size_t aN1 = 0 
+                  , const size_t aN2 = 0 
+                  , const size_t aN3 = 0 
+                  , const size_t aN4 = 0 
+                  , const size_t aN5 = 0 
+                  , const size_t aN6 = 0 
+                  , const size_t aN7 = 0 
+                  )
+    {
+      return ( aN0 ? 1 : 0 ) +
+             ( aN1 ? 1 : 0 ) +
+             ( aN2 ? 1 : 0 ) +
+             ( aN3 ? 1 : 0 ) +
+             ( aN4 ? 1 : 0 ) +
+             ( aN5 ? 1 : 0 ) +
+             ( aN6 ? 1 : 0 ) +
+             ( aN7 ? 1 : 0 );
+    }
+
+  template< unsigned Rank , unsigned I >
+  KOKKOS_INLINE_FUNCTION
+  static constexpr size_t
+    get_non_zero( const size_t aN0
+                , const size_t aN1
+                , const size_t aN2
+                , const size_t aN3
+                , const size_t aN4
+                , const size_t aN5
+                , const size_t aN6
+                , const size_t aN7
+                )
+    {
+      return ( 0 < Rank && I < 1                                                     && aN0 ? aN0 :
+             ( 1 < Rank && I < 2 && I == count_non_zero(aN0)                         && aN1 ? aN1 :
+             ( 2 < Rank && I < 3 && I == count_non_zero(aN0,aN1)                     && aN2 ? aN2 :
+             ( 3 < Rank && I < 4 && I == count_non_zero(aN0,aN1,aN2)                 && aN3 ? aN3 :
+             ( 4 < Rank && I < 5 && I == count_non_zero(aN0,aN1,aN2,aN3)             && aN4 ? aN4 :
+             ( 5 < Rank && I < 6 && I == count_non_zero(aN0,aN1,aN2,aN3,aN4)         && aN5 ? aN5 :
+             ( 6 < Rank && I < 7 && I == count_non_zero(aN0,aN1,aN2,aN3,aN4,aN5)     && aN6 ? aN6 :
+             ( 7 < Rank && I < 8 && I == count_non_zero(aN0,aN1,aN2,aN3,aN4,aN5,aN6) && aN7 ? aN7 : 0 ))))))));
+    }
+  
+  template< unsigned Rank , unsigned I , class DimRHS , class LayoutRHS >
+  KOKKOS_INLINE_FUNCTION
+  static constexpr size_t
+    get_non_zero( const size_t aN0 , const size_t aN1 , const size_t aN2 , const size_t aN3
+                , const size_t aN4 , const size_t aN5 , const size_t aN6 , const size_t aN7
+                , const ViewOffset< DimRHS , LayoutRHS , void > & rhs )
+    {
+      return ( 0 < Rank && I < 1                                                     && aN0 ? rhs.stride_0() :
+             ( 1 < Rank && I < 2 && I == count_non_zero(aN0)                         && aN1 ? rhs.stride_1() :
+             ( 2 < Rank && I < 3 && I == count_non_zero(aN0,aN1)                     && aN2 ? rhs.stride_2() :
+             ( 3 < Rank && I < 4 && I == count_non_zero(aN0,aN1,aN2)                 && aN3 ? rhs.stride_3() :
+             ( 4 < Rank && I < 5 && I == count_non_zero(aN0,aN1,aN2,aN3)             && aN4 ? rhs.stride_4() :
+             ( 5 < Rank && I < 6 && I == count_non_zero(aN0,aN1,aN2,aN3,aN4)         && aN5 ? rhs.stride_5() :
+             ( 6 < Rank && I < 7 && I == count_non_zero(aN0,aN1,aN2,aN3,aN4,aN5)     && aN6 ? rhs.stride_6() :
+             ( 7 < Rank && I < 8 && I == count_non_zero(aN0,aN1,aN2,aN3,aN4,aN5,aN6) && aN7 ? rhs.stride_7() : 0 ))))))));
+    }
+  
+
+public:
+
+  template< class DimRHS , class LayoutRHS >
+  KOKKOS_INLINE_FUNCTION
+  constexpr ViewOffset( const ViewOffset< DimRHS , LayoutRHS , void > & rhs
+                      , const size_t aN0
+                      , const size_t aN1
+                      , const size_t aN2
+                      , const size_t aN3
+                      , const size_t aN4
+                      , const size_t aN5
+                      , const size_t aN6
+                      , const size_t aN7
+                      )
+    // Contract the non-zero dimensions
+    : m_dim( ViewOffset::template get_non_zero<DimRHS::rank,0>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 )
+           , ViewOffset::template get_non_zero<DimRHS::rank,1>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 )
+           , ViewOffset::template get_non_zero<DimRHS::rank,2>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 )
+           , ViewOffset::template get_non_zero<DimRHS::rank,3>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 )
+           , ViewOffset::template get_non_zero<DimRHS::rank,4>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 )
+           , ViewOffset::template get_non_zero<DimRHS::rank,5>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 )
+           , ViewOffset::template get_non_zero<DimRHS::rank,6>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 )
+           , ViewOffset::template get_non_zero<DimRHS::rank,7>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7 )
+           )
+    , m_stride( ViewOffset::template get_non_zero<DimRHS::rank,0>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7, rhs )
+              , ViewOffset::template get_non_zero<DimRHS::rank,1>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7, rhs )
+              , ViewOffset::template get_non_zero<DimRHS::rank,2>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7, rhs )
+              , ViewOffset::template get_non_zero<DimRHS::rank,3>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7, rhs )
+              , ViewOffset::template get_non_zero<DimRHS::rank,4>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7, rhs )
+              , ViewOffset::template get_non_zero<DimRHS::rank,5>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7, rhs )
+              , ViewOffset::template get_non_zero<DimRHS::rank,6>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7, rhs )
+              , ViewOffset::template get_non_zero<DimRHS::rank,7>( aN0, aN1, aN2, aN3, aN4, aN5, aN6, aN7, rhs )
+              )
+    {
+    }
+
+  //----------------------------------------
+};
+
+}}} // namespace Kokkos::Experimental::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+struct ALL_t {};
+
+template< class T >
+struct ViewOffsetRange {
+
+  static_assert( std::is_integral<T>::value , "Non-range must be an integral type" );
+
+  enum { is_range = false };
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t dimension( size_t const , T const & ) { return 0 ; }
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t begin( T const & i ) { return size_t(i) ; }
+};
+
+template<>
+struct ViewOffsetRange<void> {
+  enum { is_range = false };
+};
+
+template<>
+struct ViewOffsetRange< Kokkos::Experimental::Impl::ALL_t > {
+  enum { is_range = true };
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t dimension( size_t const n , Experimental::Impl::ALL_t const & ) { return n ; }
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t begin( Experimental::Impl::ALL_t const & ) { return 0 ; }
+};
+
+template< typename iType >
+struct ViewOffsetRange< std::pair<iType,iType> > {
+
+  static_assert( std::is_integral<iType>::value , "Range bounds must be an integral type" );
+
+  enum { is_range = true };
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t dimension( size_t const n , std::pair<iType,iType> const & r )
+    { return ( size_t(r.first) < size_t(r.second) && size_t(r.second) <= n ) ? size_t(r.second) - size_t(r.first) : 0 ; }
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t begin( std::pair<iType,iType> const & r ) { return size_t(r.first) ; }
+};
+
+template< typename iType >
+struct ViewOffsetRange< Kokkos::pair<iType,iType> > {
+
+  static_assert( std::is_integral<iType>::value , "Range bounds must be an integral type" );
+
+  enum { is_range = true };
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t dimension( size_t const n , Kokkos::pair<iType,iType> const & r )
+    { return ( size_t(r.first) < size_t(r.second) && size_t(r.second) <= n ) ? size_t(r.second) - size_t(r.first) : 0 ; }
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t begin( Kokkos::pair<iType,iType> const & r ) { return size_t(r.first) ; }
+};
+
+template< typename iType >
+struct ViewOffsetRange< std::initializer_list< iType > > {
+
+  static_assert( std::is_integral<iType>::value , "Range bounds must be an integral type" );
+
+  enum { is_range = true };
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t dimension( size_t const n , std::initializer_list< iType > const & r )
+    {
+      return ( size_t(r.begin()[0]) < size_t(r.begin()[1]) && size_t(r.begin()[1]) <= n )
+             ? size_t(r.begin()[1]) - size_t(r.begin()[0]) : 0 ;
+    }
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t begin( std::initializer_list< iType > const & r ) { return size_t(r.begin()[0]) ; }
+};
+
+}}} // namespace Kokkos::Experimental::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+/** \brief  ViewDataHandle provides the type of the 'data handle' which the view
+ *          uses to access data with the [] operator. It also provides
+ *          an allocate function and a function to extract a raw ptr from the
+ *          data handle. ViewDataHandle also defines an enum ReferenceAble which
+ *          specifies whether references/pointers to elements can be taken and a
+ *          'return_type' which is what the view operators will give back.
+ *          Specialisation of this object allows three things depending
+ *          on ViewTraits and compiler options:
+ *          (i)   Use special allocator (e.g. huge pages/small pages and pinned memory)
+ *          (ii)  Use special data handle type (e.g. add Cuda Texture Object)
+ *          (iii) Use special access intrinsics (e.g. texture fetch and non-caching loads)
+ */
+template< class Traits , class Enable = void >
+struct ViewDataHandle {
+
+  typedef typename Traits::value_type   value_type  ;
+  typedef typename Traits::value_type * handle_type ;
+  typedef typename Traits::value_type & return_type ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationTracker  track_type  ;
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign( value_type * arg_data_ptr
+                           , track_type const & /*arg_tracker*/ )
+  {
+    return handle_type( arg_data_ptr );
+  }
+};
+
+template< class Traits >
+struct ViewDataHandle< Traits ,
+  typename std::enable_if<( std::is_same< typename Traits::non_const_value_type
+                                        , typename Traits::value_type >::value
+                            &&
+                            Traits::memory_traits::Atomic
+                          )>::type >
+{
+  typedef typename Traits::value_type  value_type ;
+  typedef typename Kokkos::Impl::AtomicViewDataHandle< Traits >  handle_type ;
+  typedef typename Kokkos::Impl::AtomicDataElement< Traits >     return_type ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationTracker    track_type  ;
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type assign( value_type * arg_data_ptr
+                           , track_type const & /*arg_tracker*/ )
+  {
+    return handle_type( arg_data_ptr );
+  }
+};
+
+}}} // namespace Kokkos::Experimental::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template< class Traits
+        , bool R0 = false
+        , bool R1 = false
+        , bool R2 = false
+        , bool R3 = false
+        , bool R4 = false
+        , bool R5 = false
+        , bool R6 = false
+        , bool R7 = false
+        , typename Enable = void >
+struct SubviewMapping ;
+
+/** \brief  View mapping for non-specialized data type and standard layout */
+template< class Traits >
+class ViewMapping< Traits , void ,
+  typename std::enable_if<(
+    std::is_same< typename Traits::specialize , void >::value
+    &&
+    (
+      std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value ||
+      std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value ||
+      std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value
+    )
+  )>::type >
+{
+private:
+
+  template< class , class , typename > friend class ViewMapping ;
+  template< class , bool , bool , bool , bool , bool , bool , bool , bool , class > friend struct SubviewMapping ;
+  template< class , class , class , class > friend class Kokkos::Experimental::View ;
+
+  typedef ViewOffset< typename Traits::dimension
+                    , typename Traits::array_layout
+                    , void
+                    >  offset_type ;
+
+  typedef typename ViewDataHandle< Traits >::handle_type  handle_type ;
+
+  handle_type  m_handle ;
+  offset_type  m_offset ;
+
+public:
+
+  //----------------------------------------
+  // Domain dimensions
+
+  enum { Rank = Traits::dimension::rank };
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return m_offset.dimension_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return m_offset.dimension_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return m_offset.dimension_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return m_offset.dimension_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_4() const { return m_offset.dimension_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_5() const { return m_offset.dimension_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_6() const { return m_offset.dimension_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t dimension_7() const { return m_offset.dimension_7(); }
+
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { return m_offset.stride_0(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { return m_offset.stride_1(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { return m_offset.stride_2(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { return m_offset.stride_3(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { return m_offset.stride_4(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { return m_offset.stride_5(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { return m_offset.stride_6(); }
+  KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { return m_offset.stride_7(); }
+
+  /*
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::Array<size_t,Rank> dimension() const
+    { return Kokkos::Experimental::Impl::dimension( m_offset.m_dim ); }
+  */
+
+  //----------------------------------------
+  // Range span
+
+  /** \brief  Span of the mapped range */
+  KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_offset.span(); }
+
+  /** \brief  Is the mapped range span contiguous */
+  KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { return m_offset.span_is_contiguous(); }
+
+  typedef typename ViewDataHandle< Traits >::return_type  reference_type ;
+
+  /** \brief  If data references are lvalue_reference than can query pointer to memory */
+  KOKKOS_INLINE_FUNCTION constexpr typename Traits::value_type * data() const
+    {
+      typedef typename Traits::value_type * ptr_type ;
+
+      return std::is_lvalue_reference< reference_type >::value
+             ? (ptr_type) m_handle
+             : (ptr_type) 0 ;
+    }
+
+  //----------------------------------------
+  // The View class performs all rank and bounds checking before
+  // calling these element reference methods.
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference() const { return m_handle[0]; }
+
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 ) const { return m_handle[i0]; }
+
+  template< typename I0 , typename I1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 ) const
+    { return m_handle[ m_offset(i0,i1) ]; }
+
+  template< typename I0 , typename I1 , typename I2 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 ) const
+    { return m_handle[ m_offset(i0,i1,i2) ]; }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3 ) const
+    { return m_handle[ m_offset(i0,i1,i2,i3) ]; }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+                          , const I4 & i4 ) const
+    { return m_handle[ m_offset(i0,i1,i2,i3,i4) ]; }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+                          , const I4 & i4 , const I5 & i5 ) const
+    { return m_handle[ m_offset(i0,i1,i2,i3,i4,i5) ]; }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+                          , const I4 & i4 , const I5 & i5 , const I6 & i6 ) const
+    { return m_handle[ m_offset(i0,i1,i2,i3,i4,i5,i6) ]; }
+
+  template< typename I0 , typename I1 , typename I2 , typename I3
+          , typename I4 , typename I5 , typename I6 , typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  reference_type reference( const I0 & i0 , const I1 & i1 , const I2 & i2 , const I3 & i3
+                          , const I4 & i4 , const I5 & i5 , const I6 & i6 , const I7 & i7 ) const
+    { return m_handle[ m_offset(i0,i1,i2,i3,i4,i5,i6,i7) ]; }
+
+  //----------------------------------------
+
+private:
+
+  enum { MemorySpanMask = 8 - 1 /* Force alignment on 8 byte boundary */ };
+  enum { MemorySpanSize = sizeof(typename Traits::value_type) };
+
+public:
+
+  /** \brief  Span, in bytes, of the referenced memory */
+  KOKKOS_INLINE_FUNCTION constexpr size_t memory_span() const
+    {
+      return ( m_offset.span() * sizeof(typename Traits::value_type) + MemorySpanMask ) & ~size_t(MemorySpanMask);
+    }
+
+  /** \brief  Span, in bytes, of the required memory */
+  template< bool AllowPadding >
+  KOKKOS_INLINE_FUNCTION
+  static constexpr size_t memory_span( const std::integral_constant<bool,AllowPadding> &
+                                     , const size_t N0 , const size_t N1 , const size_t N2 , const size_t N3
+                                      , const size_t N4 , const size_t N5 , const size_t N6 , const size_t N7 )
+    {
+      typedef std::integral_constant< unsigned , AllowPadding ? MemorySpanSize : 0 >  padding ;
+      return ( offset_type( padding(), N0, N1, N2, N3, N4, N5, N6, N7 ).span() * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
+    }
+
+  /** \brief  Span, in bytes, of the required memory */
+  template< bool AllowPadding >
+  KOKKOS_INLINE_FUNCTION
+  static constexpr size_t memory_span( const std::integral_constant<bool,AllowPadding> &
+                                       , const typename Traits::array_layout & layout )
+    {
+      return ( offset_type( layout ).span() * MemorySpanSize + MemorySpanMask ) & ~size_t(MemorySpanMask);
+    }
+
+  //----------------------------------------
+
+  KOKKOS_INLINE_FUNCTION ~ViewMapping() {}
+  KOKKOS_INLINE_FUNCTION ViewMapping() : m_handle(), m_offset() {}
+  KOKKOS_INLINE_FUNCTION ViewMapping( const ViewMapping & rhs )
+    : m_handle( rhs.m_handle ), m_offset( rhs.m_offset ) {}
+  KOKKOS_INLINE_FUNCTION ViewMapping & operator = ( const ViewMapping & rhs )
+    { m_handle = rhs.m_handle ; m_offset = rhs.m_offset ; return *this ; }
+
+  KOKKOS_INLINE_FUNCTION ViewMapping( ViewMapping && rhs )
+    : m_handle( rhs.m_handle ), m_offset( rhs.m_offset ) {}
+  KOKKOS_INLINE_FUNCTION ViewMapping & operator = ( ViewMapping && rhs )
+    { m_handle = rhs.m_handle ; m_offset = rhs.m_offset ; return *this ; }
+
+  template< bool AllowPadding >
+  KOKKOS_INLINE_FUNCTION
+  ViewMapping( void * ptr
+             , const std::integral_constant<bool,AllowPadding> &
+             , const size_t N0 , const size_t N1 , const size_t N2 , const size_t N3
+             , const size_t N4 , const size_t N5 , const size_t N6 , const size_t N7 )
+    : m_handle( reinterpret_cast< handle_type >( ptr ) )
+    , m_offset( std::integral_constant< unsigned , AllowPadding ? sizeof(typename Traits::value_type) : 0 >()
+              , N0, N1, N2, N3, N4, N5, N6, N7 )
+    {}
+
+  template< bool AllowPadding >
+  KOKKOS_INLINE_FUNCTION
+  ViewMapping( void * ptr
+             , const std::integral_constant<bool,AllowPadding> &
+             , const typename Traits::array_layout & layout )
+    : m_handle( reinterpret_cast< handle_type >( ptr ) )
+    , m_offset( layout )
+    {}
+
+  //----------------------------------------
+  // If the View is to construct or destroy the elements.
+
+  struct FunctorTagConstructScalar {};
+  struct FunctorTagConstructNonScalar {};
+  struct FunctorTagDestructNonScalar {};
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( const FunctorTagConstructScalar & , const size_t i ) const
+    { m_handle[i] = 0 ; }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( const FunctorTagConstructNonScalar & , const size_t i ) const
+    { 
+      typedef typename Traits::value_type  value_type ;
+      new( & m_handle[i] ) value_type();
+    }
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( const FunctorTagDestructNonScalar & , const size_t i ) const
+    { 
+      typedef typename Traits::value_type  value_type ;
+      ( & (m_handle[i]) )->~value_type();
+    }
+
+  template< class ExecSpace >
+  typename std::enable_if< Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+                           std::is_scalar< typename Traits::value_type >::value >::type
+  construct( const ExecSpace & space ) const
+    {
+      typedef Kokkos::RangePolicy< ExecSpace , FunctorTagConstructScalar , size_t > Policy ;
+
+      (void) Kokkos::Impl::ParallelFor< ViewMapping , Policy >( *this , Policy( 0 , m_offset.span() ) );
+    }
+
+  template< class ExecSpace >
+  typename std::enable_if< Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+                           ! std::is_scalar< typename Traits::value_type >::value >::type
+  construct( const ExecSpace & space ) const
+    {
+      typedef Kokkos::RangePolicy< ExecSpace , FunctorTagConstructNonScalar , size_t > Policy ;
+
+      (void) Kokkos::Impl::ParallelFor< ViewMapping , Policy >( *this , Policy( 0 , m_offset.span() ) );
+    }
+
+  template< class ExecSpace >
+  typename std::enable_if< Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+                           std::is_scalar< typename Traits::value_type >::value >::type
+  destroy( const ExecSpace & ) const {}
+
+  template< class ExecSpace >
+  typename std::enable_if< Kokkos::Impl::is_execution_space<ExecSpace>::value &&
+                           ! std::is_scalar< typename Traits::value_type >::value >::type
+  destroy( const ExecSpace & space ) const
+    {
+      typedef Kokkos::RangePolicy< ExecSpace , FunctorTagDestructNonScalar , size_t > Policy ;
+
+      (void) Kokkos::Impl::ParallelFor< ViewMapping , Policy >( *this , Policy( 0 , m_offset.span() ) );
+    }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+/** \brief  Assign compatible default mappings */
+
+template< class DstTraits , class SrcTraits >
+class ViewMapping< DstTraits , SrcTraits ,
+  typename std::enable_if<(
+    std::is_same< typename DstTraits::memory_space , typename SrcTraits::memory_space >::value
+    &&
+    std::is_same< typename DstTraits::specialize , void >::value
+    &&
+    (
+      std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
+      std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
+      std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
+    )
+    &&
+    std::is_same< typename SrcTraits::specialize , void >::value
+    &&
+    (
+      std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
+      std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
+      std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
+    )
+  )>::type >
+{
+public:
+
+  enum { is_assignable = true };
+
+  typedef Kokkos::Experimental::Impl::SharedAllocationTracker  TrackType ;
+  typedef ViewMapping< DstTraits , void , void >  DstType ;
+  typedef ViewMapping< SrcTraits , void , void >  SrcType ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void assign( DstType & dst , const SrcType & src , const TrackType & src_track )
+    {
+      static_assert( std::is_same< typename DstTraits::value_type , typename SrcTraits::value_type >::value ||
+                     std::is_same< typename DstTraits::value_type , typename SrcTraits::const_value_type >::value
+                   , "View assignment must have same value type or const = non-const" );
+
+      static_assert( ViewDimensionAssignable< typename DstTraits::dimension , typename SrcTraits::dimension >::value
+                   , "View assignment must have compatible dimensions" );
+
+      static_assert( std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value ||
+                     std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value ||
+                     ( DstTraits::dimension::rank == 0 ) ||
+                     ( DstTraits::dimension::rank == 1 && DstTraits::dimension::rank_dynamic == 1 )
+                   , "View assignment must have compatible layout or have rank <= 1" );
+
+      typedef typename DstType::offset_type  dst_offset_type ;
+
+      dst.m_offset = dst_offset_type( src.m_offset );
+      dst.m_handle = Kokkos::Experimental::Impl::ViewDataHandle< DstTraits >::assign( src.m_handle , src_track );
+    }
+};
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+/** \brief  View mapping for non-specialized data type and standard layout */
+template< class Traits , bool R0 , bool R1 , bool R2 , bool R3 , bool R4 , bool R5 , bool R6 , bool R7 >
+struct SubviewMapping< Traits, R0, R1, R2, R3, R4, R5, R6, R7 ,
+  typename std::enable_if<(
+    std::is_same< typename Traits::specialize , void >::value
+    &&
+    (
+      std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value ||
+      std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value ||
+      std::is_same< typename Traits::array_layout , Kokkos::LayoutStride >::value
+    )
+  )>::type >
+{
+private:
+
+  // Subview's rank
+  enum { rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3)
+              + unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) };
+
+  // Whether right-most rank is a range.
+  enum { R0_rev = 0 == Traits::rank ? false : (
+                  1 == Traits::rank ? R0 : (
+                  2 == Traits::rank ? R1 : (
+                  3 == Traits::rank ? R2 : (
+                  4 == Traits::rank ? R3 : (
+                  5 == Traits::rank ? R4 : (
+                  6 == Traits::rank ? R5 : (
+                  7 == Traits::rank ? R6 : R7 ))))))) };
+
+  // Subview's layout
+  typedef typename std::conditional<
+      ( /* Same array layout IF */
+        ( rank == 0 ) /* output rank zero */
+        ||
+        // OutputRank 1 or 2, InputLayout Left, Interval 0
+        // because single stride one or second index has a stride.
+        ( rank <= 2 && R0 && std::is_same< typename Traits::array_layout , Kokkos::LayoutLeft >::value )
+        ||
+        // OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1]
+        // because single stride one or second index has a stride.
+        ( rank <= 2 && R0_rev && std::is_same< typename Traits::array_layout , Kokkos::LayoutRight >::value )
+      ), typename Traits::array_layout , Kokkos::LayoutStride
+      >::type array_layout ;
+
+  typedef typename Traits::value_type  value_type ;
+
+  typedef typename std::conditional< rank == 0 , value_type ,
+          typename std::conditional< rank == 1 , value_type * ,
+          typename std::conditional< rank == 2 , value_type ** ,
+          typename std::conditional< rank == 3 , value_type *** ,
+          typename std::conditional< rank == 4 , value_type **** ,
+          typename std::conditional< rank == 5 , value_type ***** ,
+          typename std::conditional< rank == 6 , value_type ****** ,
+          typename std::conditional< rank == 7 , value_type ******* ,
+                                                 value_type ********
+          >::type >::type >::type >::type >::type >::type >::type >::type
+     data_type ;
+
+public:
+
+  typedef 
+    Kokkos::Experimental::ViewTraits< data_type , array_layout
+                                    , typename Traits::device_type
+                                    , typename Traits::memory_traits > traits_type ;
+
+  typedef Kokkos::Experimental::View< data_type
+                                    , array_layout
+                                    , typename Traits::device_type
+                                    , typename Traits::memory_traits > type ;
+
+  template< class T0 , class T1 , class T2 , class T3
+          , class T4 , class T5 , class T6 , class T7 >
+  KOKKOS_INLINE_FUNCTION
+  static void assign( ViewMapping< traits_type , void , void > & dst
+                    , ViewMapping< Traits , void , void > const & src
+                    , T0 const & arg0
+                    , T1 const & arg1
+                    , T2 const & arg2
+                    , T3 const & arg3
+                    , T4 const & arg4
+                    , T5 const & arg5
+                    , T6 const & arg6
+                    , T7 const & arg7
+                    )
+    {
+      typedef ViewMapping< traits_type , void , void >  DstType ;
+
+      typedef typename DstType::offset_type  dst_offset_type ;
+      typedef typename DstType::handle_type  dst_handle_type ;
+
+      typedef Kokkos::Experimental::Impl::ViewOffsetRange<T0>  V0 ;
+      typedef Kokkos::Experimental::Impl::ViewOffsetRange<T1>  V1 ;
+      typedef Kokkos::Experimental::Impl::ViewOffsetRange<T2>  V2 ;
+      typedef Kokkos::Experimental::Impl::ViewOffsetRange<T3>  V3 ;
+      typedef Kokkos::Experimental::Impl::ViewOffsetRange<T4>  V4 ;
+      typedef Kokkos::Experimental::Impl::ViewOffsetRange<T5>  V5 ;
+      typedef Kokkos::Experimental::Impl::ViewOffsetRange<T6>  V6 ;
+      typedef Kokkos::Experimental::Impl::ViewOffsetRange<T7>  V7 ;
+
+      dst.m_offset = dst_offset_type
+        ( src.m_offset
+        , V0::dimension( src.m_offset.dimension_0() , arg0 )
+        , V1::dimension( src.m_offset.dimension_1() , arg1 )
+        , V2::dimension( src.m_offset.dimension_2() , arg2 )
+        , V3::dimension( src.m_offset.dimension_3() , arg3 )
+        , V4::dimension( src.m_offset.dimension_4() , arg4 )
+        , V5::dimension( src.m_offset.dimension_5() , arg5 )
+        , V6::dimension( src.m_offset.dimension_6() , arg6 )
+        , V7::dimension( src.m_offset.dimension_7() , arg7 )
+        );
+
+      dst.m_handle = dst_handle_type( src.m_handle +
+                                      src.m_offset( V0::begin( arg0 )
+                                                  , V1::begin( arg1 )
+                                                  , V2::begin( arg2 )
+                                                  , V3::begin( arg3 )
+                                                  , V4::begin( arg4 )
+                                                  , V5::begin( arg5 )
+                                                  , V6::begin( arg6 )
+                                                  , V7::begin( arg7 )
+                                                  ) );
+    }
+};
+
+}}} // namespace Kokkos::Experimental::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+template< class V
+        , bool R0 = false , bool R1 = false , bool R2 = false , bool R3 = false
+        , bool R4 = false , bool R5 = false , bool R6 = false , bool R7 = false >
+struct SubviewType ;
+
+template< class D , class A1, class A2, class A3
+        , bool R0 , bool R1 , bool R2 , bool R3
+        , bool R4 , bool R5 , bool R6 , bool R7 >
+struct SubviewType< Kokkos::Experimental::View< D , A1, A2, A3 > , R0 , R1 , R2 , R3 , R4 , R5 , R6 , R7 >
+{
+private:
+  typedef Kokkos::Experimental::ViewTraits< D , A1 , A2 , A3 >  traits ;
+  typedef Kokkos::Experimental::Impl::SubviewMapping< traits , R0 , R1 , R2 , R3 , R4 , R5 , R6 , R7 >  mapping ;
+public:
+  typedef typename mapping::type  type ;
+};
+
+}}} // namespace Kokkos::Experimental::Impl
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+class Error_view_scalar_reference_to_non_scalar_view ;
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+#if defined( KOKKOS_EXPRESSION_CHECK )
+
+#define KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( SPACE , MAP , RANK , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
+    Kokkos::Impl::ActiveExecutionMemorySpace , SPACE >::verify( MAP.data() ); \
+  /* array bounds checking */
+
+#else
+
+#define KOKKOS_ASSERT_VIEW_MAPPING_ACCESS( SPACE , MAP , RANK , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) \
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
+    Kokkos::Impl::ActiveExecutionMemorySpace , SPACE >::verify( MAP.data() )
+
+#endif
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_EXPERIMENTAL_VIEW_MAPPING_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.cpp b/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..7fb33853d667c829417bffda2146e4149c3cf2d2
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.cpp
@@ -0,0 +1,844 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core_fwd.hpp>
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+#include <Kokkos_Atomic.hpp>
+
+#include <impl/Kokkos_Singleton.hpp>
+#include <impl/Kokkos_AllocationTracker.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+
+#include <string>
+#include <vector>
+#include <sstream>
+#include <algorithm>
+#include <utility>
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <iomanip>
+
+/* Enable clean up of memory leaks */
+#define CLEAN_UP_MEMORY_LEAKS 0
+
+namespace Kokkos { namespace Impl {
+
+namespace {
+
+
+//-----------------------------------------------------------------------------
+// AllocationRecord
+//-----------------------------------------------------------------------------
+//
+// Used to track details about an allocation and provide a ref count
+// sizeof(AllocationRecord) == 128
+struct AllocationRecord
+{
+  enum {
+     OFFSET = sizeof(AllocatorBase*)          // allocator
+            + sizeof(void*)                   // alloc_ptr
+            + sizeof(uint64_t)                // alloc_size
+            + sizeof(AllocatorAttributeBase*) // attribute
+            + sizeof(uint32_t)                // node_index
+            + sizeof(uint32_t)                // ref_count
+   , LABEL_LENGTH = 128 - OFFSET
+  };
+
+  AllocatorBase * const          allocator;
+  void * const                   alloc_ptr;
+  const uint64_t                 alloc_size;
+  AllocatorAttributeBase * const attribute;
+  const int32_t                  node_index;
+  volatile uint32_t              ref_count;
+  const char                     label[LABEL_LENGTH];
+
+
+  AllocationRecord(  AllocatorBase * const arg_allocator
+                   , void *   arg_alloc_ptr
+                   , uint64_t arg_alloc_size
+                   , int32_t  arg_node_index
+                   , const std::string & arg_label
+                  )
+    : allocator(arg_allocator)
+    , alloc_ptr(arg_alloc_ptr)
+    , alloc_size(arg_alloc_size)
+    , attribute(NULL)
+    , node_index(arg_node_index)
+    , ref_count(1)
+    , label() // zero fill
+  {
+    const size_t length = static_cast<size_t>(LABEL_LENGTH-1u) < arg_label.size() ? static_cast<size_t>(LABEL_LENGTH-1u) : arg_label.size();
+    strncpy( const_cast<char *>(label), arg_label.c_str(), length );
+  }
+
+  ~AllocationRecord()
+  {
+    if (attribute) {
+      delete attribute;
+    }
+  }
+
+  uint32_t increment_ref_count()
+  {
+    uint32_t old_value = atomic_fetch_add( &ref_count, static_cast<uint32_t>(1) );
+    return old_value + 1u;
+  }
+
+  uint32_t decrement_ref_count()
+  {
+    uint32_t old_value = atomic_fetch_sub( &ref_count, static_cast<uint32_t>(1) );
+    return old_value - 1u;
+  }
+
+  void print( std::ostream & oss ) const
+  {
+    oss << "{ " << allocator->name()
+        << " } : \"" << label
+        << "\" ref_count(" << ref_count
+        << ") memory[ " << alloc_ptr
+        << " + " << alloc_size
+        << " ]" ;
+  }
+
+  bool set_attribute( AllocatorAttributeBase * attr )
+  {
+    bool result = false;
+    if (attribute == NULL) {
+      result = NULL == atomic_compare_exchange(  const_cast<AllocatorAttributeBase **>(&attribute)
+                                               , reinterpret_cast<AllocatorAttributeBase *>(NULL)
+                                               , attr );
+    }
+
+    return result;
+  }
+
+  // disallow copy and assignment
+  AllocationRecord( const AllocationRecord & );
+  AllocationRecord & operator=(const AllocationRecord &);
+};
+
+template <int NumBlocks>
+struct Bitset
+{
+  enum { blocks = NumBlocks };
+  enum { size = blocks * 64 };
+  enum { block_mask = 63u };
+  enum { block_shift = 6 };
+
+  // used to find free bits in a bitset
+  static int count_trailing_zeros(uint64_t x)
+  {
+    #if defined( KOKKOS_COMPILER_GNU ) || defined( KOKKOS_COMPILER_CLANG ) || defined( KOKKOS_COMPILER_APPLECC )
+      return x ? __builtin_ctzll(x) : 64;
+    #elif defined( KOKKOS_COMPILER_INTEL )
+      enum { shift = 32 };
+      enum { mask = (static_cast<uint64_t>(1) << shift) - 1u };
+      return (x & mask) ? _bit_scan_forward(static_cast<int>(x & mask)) :
+             (x >> shift) ? shift + _bit_scan_forward(static_cast<int>(x >> shift)) :
+             64 ;
+    #elif defined( KOKKOS_COMPILER_IBM )
+      return x ? __cnttz8(x) : 64;
+    #else
+      int i = 0;
+      for (; ((x & (static_cast<uint64_t>(1) << i)) == 0u) && i < 64; ++i ) {}
+      return i;
+    #endif
+  }
+
+  Bitset()
+    : m_bits()
+  {
+    for (int i=0; i < blocks; ++i) {
+      m_bits[i] = 0u;
+    }
+  }
+
+  bool set( int i )
+  {
+    const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
+    return !( atomic_fetch_or( m_bits + (i >> block_shift), bit ) & bit );
+  }
+
+  bool reset( int i )
+  {
+    const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
+    return atomic_fetch_and( m_bits + (i >> block_shift), ~bit ) & bit;
+  }
+
+  bool test( int i )
+  {
+    const uint64_t block = m_bits[ i >> block_shift ];
+    const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
+    return block & bit;
+  }
+
+  int find_first_unset() const
+  {
+    for (int i=0; i < blocks; ++i) {
+      const uint64_t block = m_bits[i];
+      int b = count_trailing_zeros( ~block );
+
+      if ( b < 64 ) {
+        return (i << block_shift) + b;
+      }
+    }
+    return size;
+  }
+
+  volatile uint64_t m_bits[blocks];
+};
+
+//-----------------------------------------------------------------------------
+// AllocationRecordPool -- singleton class
+//
+// global_alloc_rec_pool is the ONLY instance of this class
+//
+//-----------------------------------------------------------------------------
+// Record AllocationRecords in a lock-free circular list.
+// Each node in the list has a buffer with space for 959 ((15*64)-1) records
+// managed by a bitset.  Atomics are used to set and reset bits in the bit set.
+// The head of the list is atomically updated to the last node found with
+// unused space.
+//
+// Cost time to create an allocation record: amortized O(1), worst case O(num nodes)
+// Cost to destroy an allocation recored: O(1)
+//
+// Singleton allocations are pushed onto a lock-free stack that is destroyed
+// after the circular list of allocation records.
+struct AllocationRecordPool
+{
+  enum { BITSET_BLOCKS = 15 };
+
+  typedef Bitset<BITSET_BLOCKS> bitset_type;
+
+  enum { BUFFER_SIZE = (bitset_type::size - 1) * sizeof(AllocationRecord) };
+
+  struct AllocationNode
+  {
+    AllocationNode()
+      : next()
+      , bitset()
+      , buffer()
+    {
+      // set the first bit to used
+      bitset.set(0);
+    }
+
+    void * get_buffer( int32_t node_index )
+    {
+      return buffer + (node_index-1) * sizeof(AllocationRecord);
+    }
+
+    // return 0 if no space is available in the node
+    int32_t get_node_index()
+    {
+      int32_t node_index = 0;
+      do {
+        node_index = bitset.find_first_unset();
+
+        // successfully claimed a bit
+        if ( node_index != bitset.size && bitset.set(node_index) )
+        {
+          return node_index;
+        }
+      } while ( node_index != bitset.size );
+      return 0;
+    }
+
+    void clear_node_index( int32_t node_index )
+    {
+      bitset.reset(node_index);
+    }
+
+    AllocationNode * next;
+    bitset_type      bitset;
+    char             buffer[BUFFER_SIZE];
+  };
+
+  struct SingletonNode
+  {
+    void * buffer;
+    SingletonNode * next;
+    Impl::singleton_destroy_function_type destroy;
+
+    SingletonNode( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func  )
+      : buffer(NULL)
+      , next(NULL)
+      , destroy(destroy_func)
+    {
+      if (size) {
+        buffer = malloc(size);
+        create_func(buffer);
+      }
+    }
+
+    ~SingletonNode()
+    {
+      if (buffer) {
+        try {
+          destroy(buffer);
+        } catch(...) {}
+        free(buffer);
+      }
+    }
+  };
+
+  AllocationRecordPool()
+    : head( new AllocationNode() )
+    , singleton_head(NULL)
+  {
+    // setup ring
+    head->next = head;
+  }
+
+  ~AllocationRecordPool()
+  {
+    // delete allocation records
+    {
+      AllocationNode * start = head;
+
+      AllocationNode * curr = start;
+
+      std::vector< std::string > string_vec;
+
+      do {
+        AllocationNode * next = curr->next;
+
+        #if defined( KOKKOS_DEBUG_PRINT_ALLOCATION_BITSET )
+        // print node bitset
+        for (int i=0; i < bitset_type::blocks; ++i ) {
+          std::cout << std::hex << std::showbase << curr->bitset.m_bits[i] << "   ";
+        }
+        std::cout << std::endl;
+        #endif
+
+        // bit zero does not map to an AllocationRecord
+        for ( int32_t i=1; i < bitset_type::size; ++i )
+        {
+          if (curr->bitset.test(i)) {
+            AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
+
+            std::ostringstream oss;
+            alloc_rec->print( oss );
+            string_vec.push_back( oss.str() );
+
+#if CLEAN_UP_MEMORY_LEAKS
+/* Cleaning up memory leaks prevents memory error detection tools
+ * from reporting the original source of allocation, which can
+ * impede debugging with such tools.
+ */
+            try {
+              destroy(alloc_rec);
+            }
+            catch(...) {}
+#endif
+          }
+        }
+
+        curr->next = NULL;
+
+        delete curr;
+
+        curr = next;
+      } while ( curr != start );
+
+      if ( !string_vec.empty() ) {
+        std::sort( string_vec.begin(), string_vec.end() );
+
+        std::ostringstream oss;
+        oss << "Error: Allocation pool destroyed with the following memory leak(s):\n";
+        for (size_t i=0; i< string_vec.size(); ++i)
+        {
+          oss << "   " << string_vec[i] << std::endl;
+        }
+
+        std::cerr << oss.str() << std::endl;
+      }
+    }
+
+    // delete singletons
+    {
+      SingletonNode * curr = singleton_head;
+
+      while (curr) {
+        SingletonNode * next = curr->next;
+        delete curr;
+        curr = next;
+      }
+    }
+  }
+
+  AllocationRecord * create(  AllocatorBase * arg_allocator
+                            , void * arg_alloc_ptr
+                            , size_t arg_alloc_size
+                            , const std::string & arg_label
+                           )
+  {
+    AllocationNode * start = volatile_load(&head);
+
+    AllocationNode * curr = start;
+
+
+    int32_t node_index = curr->get_node_index();
+
+    if (node_index == 0) {
+      curr = volatile_load(&curr->next);
+    }
+
+    while (node_index == 0 && curr != start)
+    {
+      node_index = curr->get_node_index();
+      if (node_index == 0) {
+        curr = volatile_load(&curr->next);
+      }
+    }
+
+    // Need to allocate and insert a new node
+    if (node_index == 0 && curr == start)
+    {
+      AllocationNode * new_node = new AllocationNode();
+
+      node_index = new_node->get_node_index();
+
+      AllocationNode * next = NULL;
+      do {
+        next = volatile_load(&curr->next);
+        new_node->next = next;
+        memory_fence();
+      } while ( next != atomic_compare_exchange( &(curr->next), next, new_node ) );
+
+      curr = new_node;
+    }
+
+    void * buffer = curr->get_buffer(node_index);
+
+    // try to set head to curr
+    if ( start != curr )
+    {
+      atomic_compare_exchange( & head, start, curr );
+    }
+
+    return new (buffer) AllocationRecord(  arg_allocator
+                                         , arg_alloc_ptr
+                                         , arg_alloc_size
+                                         , node_index
+                                         , arg_label
+                                        );
+  }
+
+  void destroy( AllocationRecord * alloc_rec )
+  {
+    if (alloc_rec) {
+      const int32_t node_index = alloc_rec->node_index;
+      AllocationNode * node = get_node( alloc_rec );
+
+      // deallocate memory
+      alloc_rec->allocator->deallocate( alloc_rec->alloc_ptr, alloc_rec->alloc_size );
+
+      // call destructor
+      alloc_rec->~AllocationRecord();
+
+      // wait for writes to complete
+      memory_fence();
+
+      // clear node index
+      node->clear_node_index( node_index );
+    }
+  }
+
+  void * create_singleton( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func )
+  {
+    SingletonNode * node = new SingletonNode( size, create_func, destroy_func );
+    SingletonNode * next;
+
+    // insert new node at the head of the list
+    do {
+      next = volatile_load(&singleton_head);
+      node->next = next;
+    } while ( next != atomic_compare_exchange( &singleton_head, next, node ) );
+
+    return node->buffer;
+  }
+
+  void print_memory( std::ostream & out ) const
+  {
+    AllocationNode * start = head;
+
+    AllocationNode * curr = start;
+
+    std::vector< std::string > string_vec;
+
+    do {
+      AllocationNode * next = curr->next;
+
+      // bit zero does not map to an AllocationRecord
+      for ( int32_t i=1; i < bitset_type::size; ++i )
+      {
+        if (curr->bitset.test(i)) {
+          AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
+
+          std::ostringstream oss;
+          alloc_rec->print( oss );
+          string_vec.push_back( oss.str() );
+        }
+      }
+      curr = next;
+    } while ( curr != start );
+
+    if ( !string_vec.empty() ) {
+      std::sort( string_vec.begin(), string_vec.end() );
+
+      std::ostringstream oss;
+      oss << "Tracked Memory:" << std::endl;
+      for (size_t i=0; i< string_vec.size(); ++i)
+      {
+        oss << "   " << string_vec[i] << std::endl;
+      }
+      out << oss.str() << std::endl;
+    }
+    else {
+      out << "No Tracked Memory" << std::endl;
+    }
+  }
+
+  // find an AllocationRecord such that
+  // alloc_ptr <= ptr < alloc_ptr + alloc_size
+  // otherwise return NULL
+  AllocationRecord * find( void const * ptr, AllocatorBase const * allocator ) const
+  {
+    AllocationNode * start = head;
+
+    AllocationNode * curr = start;
+
+    char const * const char_ptr = reinterpret_cast<const char *>(ptr);
+
+    do {
+      AllocationNode * next = curr->next;
+
+      // bit zero does not map to an AllocationRecord
+      for ( int32_t i=1; i < bitset_type::size; ++i )
+      {
+        if (curr->bitset.test(i)) {
+          AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
+
+          char const * const alloc_ptr = reinterpret_cast<char const *>(alloc_rec->alloc_ptr);
+
+          if (   (allocator == alloc_rec->allocator)
+              && (alloc_ptr <= char_ptr)
+              && (char_ptr < (alloc_ptr + alloc_rec->alloc_size)) )
+          {
+            return alloc_rec;
+          }
+        }
+      }
+      curr = next;
+    } while ( curr != start );
+
+    return NULL;
+  }
+
+private:
+
+  AllocationNode * get_node( AllocationRecord * alloc_rec )
+  {
+    return reinterpret_cast<AllocationNode *>( alloc_rec - alloc_rec->node_index);
+  }
+
+  AllocationNode * head;
+  SingletonNode * singleton_head;
+};
+
+// create the global pool for allocation records
+AllocationRecordPool global_alloc_rec_pool;
+
+
+
+// convert a uintptr_t to an AllocationRecord pointer
+inline
+AllocationRecord * to_alloc_rec( uintptr_t alloc_rec )
+{
+  return reinterpret_cast<AllocationRecord *>( alloc_rec & ~static_cast<uintptr_t>(1) );
+}
+
+} // unnamed namespace
+
+//-----------------------------------------------------------------------------
+// Allocation Tracker methods
+//-----------------------------------------------------------------------------
+
+// Create a reference counted AllocationTracker
+void AllocationTracker::initalize(  AllocatorBase * arg_allocator
+                                  , void * arg_alloc_ptr
+                                  , size_t arg_alloc_size
+                                  , const std::string & arg_label
+                                 )
+{
+  if ( arg_allocator && arg_alloc_ptr && arg_alloc_size) {
+    // create record
+    AllocationRecord * alloc_rec = global_alloc_rec_pool.create(  arg_allocator
+                                                                , arg_alloc_ptr
+                                                                , arg_alloc_size
+                                                                , arg_label
+                                                               );
+
+    m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
+  }
+}
+
+void AllocationTracker::reallocate( size_t size ) const
+{
+  AllocationRecord * rec = to_alloc_rec( m_alloc_rec );
+
+  void * the_alloc_ptr = rec->allocator->reallocate( rec->alloc_ptr, rec->alloc_size, size );
+
+  if ( NULL != the_alloc_ptr )
+  {
+    *const_cast<void **>(&rec->alloc_ptr) = the_alloc_ptr;
+    *const_cast<uint64_t *>(&rec->alloc_size) = size;
+  }
+  else {
+    Impl::throw_runtime_exception( "Error: unable to reallocate allocation tracker");
+  }
+}
+
+
+void AllocationTracker::increment_ref_count() const
+{
+  to_alloc_rec( m_alloc_rec )->increment_ref_count();
+}
+
+
+void AllocationTracker::decrement_ref_count() const
+{
+  AllocationRecord * alloc_rec = to_alloc_rec( m_alloc_rec );
+  uint32_t the_ref_count = alloc_rec->decrement_ref_count();
+  if (the_ref_count == 0u) {
+    try {
+      global_alloc_rec_pool.destroy( alloc_rec );
+    }
+    catch(...) {}
+  }
+}
+
+namespace {
+
+struct NullAllocator { static const char * name() { return "Null Allocator"; } };
+
+}
+
+AllocatorBase * AllocationTracker::allocator() const
+{
+  if (m_alloc_rec & REF_COUNT_MASK) {
+    return to_alloc_rec(m_alloc_rec)->allocator;
+  }
+  return Allocator<NullAllocator>::singleton();
+}
+
+void * AllocationTracker::alloc_ptr()  const
+{
+  if (m_alloc_rec & REF_COUNT_MASK) {
+    return to_alloc_rec(m_alloc_rec)->alloc_ptr;
+  }
+  return NULL;
+}
+
+size_t AllocationTracker::alloc_size() const
+{
+  if (m_alloc_rec & REF_COUNT_MASK) {
+    return to_alloc_rec(m_alloc_rec)->alloc_size;
+  }
+  return 0u;
+}
+
+size_t AllocationTracker::ref_count()  const
+{
+  if (m_alloc_rec & REF_COUNT_MASK) {
+    return to_alloc_rec(m_alloc_rec)->ref_count;
+  }
+  return 0u;
+}
+
+char const * AllocationTracker::label() const
+{
+  if (m_alloc_rec & REF_COUNT_MASK) {
+    return to_alloc_rec(m_alloc_rec)->label;
+  }
+  return "[Empty Allocation Tracker]";
+}
+
+void AllocationTracker::print( std::ostream & oss) const
+{
+  if (m_alloc_rec & REF_COUNT_MASK) {
+    to_alloc_rec(m_alloc_rec)->print(oss);
+  }
+  else {
+    oss << label();
+  }
+}
+
+bool AllocationTracker::set_attribute( AllocatorAttributeBase * attr ) const
+{
+  bool result = false;
+  if (m_alloc_rec & REF_COUNT_MASK) {
+    result = to_alloc_rec(m_alloc_rec)->set_attribute(attr);
+  }
+  return result;
+}
+
+AllocatorAttributeBase * AllocationTracker::attribute() const
+{
+  if (m_alloc_rec & REF_COUNT_MASK) {
+    return to_alloc_rec(m_alloc_rec)->attribute;
+  }
+  return NULL;
+}
+
+void AllocationTracker::print_tracked_memory( std::ostream & out )
+{
+  global_alloc_rec_pool.print_memory( out );
+}
+
+
+AllocationTracker AllocationTracker::find( void const * ptr, AllocatorBase const * arg_allocator )
+{
+  AllocationRecord * alloc_rec = global_alloc_rec_pool.find(ptr, arg_allocator);
+
+  AllocationTracker tracker;
+
+  if ( alloc_rec != NULL )
+  {
+    if ( tracking_enabled() ) {
+      alloc_rec->increment_ref_count();
+      tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
+    }
+    else {
+      tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec);
+    }
+  }
+
+  return tracker ;
+}
+
+
+
+//-----------------------------------------------------------------------------
+// static AllocationTracker
+//-----------------------------------------------------------------------------
+#if defined( KOKKOS_USE_DECENTRALIZED_HOST )
+namespace {
+
+  // TODO : Detect compiler support for thread local variables
+  #if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
+    bool g_thread_local_tracking_enabled = true;
+    #pragma omp threadprivate(g_thread_local_tracking_enabled)
+  #elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
+    __thread bool g_thread_local_tracking_enabled = true;
+  #elif defined( KOKKOS_HAVE_OPENMP )
+    bool g_thread_local_tracking_enabled = true;
+    #pragma omp threadprivate(g_thread_local_tracking_enabled)
+  #elif defined( KOKKOS_HAVE_PTHREAD )
+    __thread bool g_thread_local_tracking_enabled = true;
+  #elif defined( KOKKOS_HAVE_SERIAL )
+      bool g_thread_local_tracking_enabled = true;
+  #endif
+} // unnamed namespace
+
+void AllocationTracker::disable_tracking()
+{
+  g_thread_local_tracking_enabled = false;
+}
+
+void AllocationTracker::enable_tracking()
+{
+  g_thread_local_tracking_enabled = true;
+}
+
+bool AllocationTracker::tracking_enabled()
+{
+  return g_thread_local_tracking_enabled;
+}
+#else
+namespace {
+enum TrackingEnum { TRACKING_ENABLED, TRACKING_DISABLED };
+volatile TrackingEnum g_tracking_enabled = TRACKING_ENABLED;
+}
+
+void AllocationTracker::disable_tracking()
+{
+  if ( TRACKING_ENABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_ENABLED, TRACKING_DISABLED ) ) {
+    Impl::throw_runtime_exception("Error: Tracking already disabled");
+  }
+}
+
+void AllocationTracker::enable_tracking()
+{
+  if ( TRACKING_DISABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_DISABLED, TRACKING_ENABLED ) ) {
+    Impl::throw_runtime_exception("Error: Tracking already enabled");
+  }
+}
+
+bool AllocationTracker::tracking_enabled()
+{
+  return g_tracking_enabled == TRACKING_ENABLED;
+}
+#endif
+
+
+//-----------------------------------------------------------------------------
+// create singleton free function
+//-----------------------------------------------------------------------------
+void * create_singleton(  size_t size
+                        , Impl::singleton_create_function_type create_func
+                        , Impl::singleton_destroy_function_type destroy_func )
+{
+  return global_alloc_rec_pool.create_singleton( size, create_func, destroy_func );
+}
+
+}} // namespace Kokkos::Impl
+
+#endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.hpp b/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..331c4e8facb1e0951082cd9a715a019ee3f0c5cd
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_AllocationTracker.hpp
@@ -0,0 +1,586 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_ALLOCATION_TRACKER_HPP
+#define KOKKOS_ALLOCATION_TRACKER_HPP
+
+#include <Kokkos_Macros.hpp>
+
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+#include <stdint.h>
+#include <cstdlib>
+#include <string>
+#include <iosfwd>
+
+namespace Kokkos { namespace Impl {
+
+//-----------------------------------------------------------------------------
+// Create Singleton objects
+//-----------------------------------------------------------------------------
+
+typedef void * (*singleton_create_function_type)(void * buffer);
+typedef void (*singleton_destroy_function_type)(void *);
+
+void * create_singleton(  size_t size
+                        , singleton_create_function_type create_func
+                        , singleton_destroy_function_type destroy_func
+                       );
+
+
+
+/// class Singleton
+///
+/// Default construct a singleton type.  This method is used to circumvent
+/// order of construction issues.  Singleton objects are destroyed after all
+/// other allocations in the reverse order of their creation.
+template <typename Type>
+class Singleton
+{
+public:
+  /// Get a pointer to the Singleton. Default construct the singleton if it does not already exist
+  static Type * get()
+  {
+    static Type * singleton = NULL;
+    if (singleton == NULL) {
+      Impl::singleton_create_function_type  create_func = &create;
+      Impl::singleton_destroy_function_type destroy_func = &destroy;
+      singleton = reinterpret_cast<Type*>( Impl::create_singleton( sizeof(Type), create_func, destroy_func ) );
+    }
+    return singleton;
+  }
+
+private:
+
+  /// Call the Type constructor
+  static void destroy(void * ptr)
+  {
+    reinterpret_cast<Type*>(ptr)->~Type();
+  }
+
+  /// placement new the Type in buffer
+  static void * create(void * buffer)
+  {
+    return new (buffer) Type();
+  }
+};
+
+
+//-----------------------------------------------------------------------------
+// AllocatorBase
+//-----------------------------------------------------------------------------
+
+/// class AllocatorBase
+///
+/// Abstract base class for all Allocators.
+/// Allocators should be singleton objects, use Singleton<Allocator>::get to create
+/// to avoid order of destruction issues
+class AllocatorBase
+{
+public:
+  /// name of the allocator
+  /// used to report memory leaks
+  virtual const char * name() const = 0;
+
+  /// Allocate a buffer of size number of bytes
+  virtual void* allocate(size_t size) const = 0;
+
+  /// Deallocate a buffer with size number of bytes
+  /// The pointer must have been allocated with a call to corresponding allocate
+  virtual void deallocate(void * ptr, size_t size) const = 0;
+
+  /// Changes the size of the memory block pointed to by ptr.
+  /// Ptr must have been allocated with the corresponding allocate call
+  /// The function may move the memory block to a new location
+  /// (whose address is returned by the function).
+  ///
+  /// The content of the memory block is preserved up to the lesser of the new and
+  /// old sizes, even if the block is moved to a new location. If the new size is larger,
+  /// the value of the newly allocated portion is indeterminate.
+  ///
+  /// In case that ptr is a null pointer, the function behaves like allocate, assigning a
+  /// new block of size bytes and returning a pointer to its beginning.
+  virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const = 0;
+
+  /// can a texture object be bound to the allocated memory
+  virtual bool support_texture_binding() const = 0;
+
+  /// virtual destructor
+  virtual ~AllocatorBase() {}
+};
+
+/// class AllocatorAttributeBase
+class AllocatorAttributeBase
+{
+public:
+  virtual ~AllocatorAttributeBase() {}
+};
+
+//-----------------------------------------------------------------------------
+// Allocator< StaticAllocator > : public AllocatorBase
+//-----------------------------------------------------------------------------
+
+// HasStaticName
+template<typename T>
+class HasStaticName
+{
+  typedef const char * (*static_method)();
+  template<typename U, static_method> struct SFINAE {};
+  template<typename U> static char Test(SFINAE<U, &U::name>*);
+  template<typename U> static int Test(...);
+public:
+  enum { value = sizeof(Test<T>(0)) == sizeof(char) };
+};
+
+
+template <typename T>
+inline
+typename enable_if<HasStaticName<T>::value, const char *>::type
+allocator_name()
+{
+  return T::name();
+}
+
+template <typename T>
+inline
+typename enable_if<!HasStaticName<T>::value, const char *>::type
+allocator_name()
+{
+  return "Unnamed Allocator";
+}
+
+
+// HasStaticAllocate
+template<typename T>
+class HasStaticAllocate
+{
+  typedef void * (*static_method)(size_t);
+  template<typename U, static_method> struct SFINAE {};
+  template<typename U> static char Test(SFINAE<U, &U::allocate>*);
+  template<typename U> static int Test(...);
+public:
+  enum { value = sizeof(Test<T>(0)) == sizeof(char) };
+};
+
+template <typename T>
+inline
+typename enable_if<HasStaticAllocate<T>::value, void *>::type
+allocator_allocate(size_t size)
+{
+  return T::allocate(size);
+}
+
+template <typename T>
+inline
+typename enable_if<!HasStaticAllocate<T>::value, void *>::type
+allocator_allocate(size_t)
+{
+  throw_runtime_exception(  std::string("Error: ")
+                          + std::string(allocator_name<T>())
+                          + std::string(" cannot allocate memory!") );
+  return NULL;
+}
+
+// HasStaticDeallocate
+template<typename T>
+class HasStaticDeallocate
+{
+  typedef void (*static_method)(void *, size_t);
+  template<typename U, static_method> struct SFINAE {};
+  template<typename U> static char Test(SFINAE<U, &U::deallocate>*);
+  template<typename U> static int Test(...);
+public:
+  enum { value = sizeof(Test<T>(0)) == sizeof(char) };
+};
+
+template <typename T>
+inline
+typename enable_if<HasStaticDeallocate<T>::value, void>::type
+allocator_deallocate(void * ptr, size_t size)
+{
+  T::deallocate(ptr,size);
+}
+
+template <typename T>
+inline
+typename enable_if<!HasStaticDeallocate<T>::value, void>::type
+allocator_deallocate(void *, size_t)
+{
+  throw_runtime_exception(  std::string("Error: ")
+                          + std::string(allocator_name<T>())
+                          + std::string(" cannot deallocate memory!") );
+}
+
+// HasStaticReallocate
+template<typename T>
+class HasStaticReallocate
+{
+  typedef void * (*static_method)(void *, size_t, size_t);
+  template<typename U, static_method> struct SFINAE {};
+  template<typename U> static char Test(SFINAE<U, &U::reallocate>*);
+  template<typename U> static int Test(...);
+public:
+  enum { value = sizeof(Test<T>(0)) == sizeof(char) };
+};
+
+template <typename T>
+inline
+typename enable_if<HasStaticReallocate<T>::value, void *>::type
+allocator_reallocate(void * old_ptr, size_t old_size, size_t new_size)
+{
+  return T::reallocate(old_ptr, old_size, new_size);
+}
+
+template <typename T>
+inline
+typename enable_if<!HasStaticReallocate<T>::value, void *>::type
+allocator_reallocate(void *, size_t, size_t)
+{
+  throw_runtime_exception(  std::string("Error: ")
+                          + std::string(allocator_name<T>())
+                          + std::string(" cannot reallocate memory!") );
+  return NULL;
+}
+
+// HasStaticReallocate
+template<typename T>
+class HasStaticSupportTextureBinding
+{
+  typedef bool (*static_method)();
+  template<typename U, static_method> struct SFINAE {};
+  template<typename U> static char Test(SFINAE<U, &U::support_texture_binding>*);
+  template<typename U> static int Test(...);
+public:
+  enum { value = sizeof(Test<T>(0)) == sizeof(char) };
+};
+
+template <typename T>
+inline
+typename enable_if<HasStaticSupportTextureBinding<T>::value, bool>::type
+allocator_support_texture_binding()
+{
+  return T::support_texture_binding();
+}
+
+template <typename T>
+inline
+typename enable_if<!HasStaticSupportTextureBinding<T>::value, bool>::type
+allocator_support_texture_binding()
+{
+  return false;
+}
+
+template <typename T>
+class Allocator : public AllocatorBase
+{
+public:
+  virtual const char * name() const
+  {
+    return allocator_name<T>();
+  }
+
+  virtual void* allocate(size_t size) const
+  {
+    return allocator_allocate<T>(size);
+  }
+
+  virtual void deallocate(void * ptr, size_t size) const
+  {
+    allocator_deallocate<T>(ptr,size);
+  }
+
+  virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const
+  {
+    return allocator_reallocate<T>(old_ptr, old_size, new_size);
+  }
+
+  virtual bool support_texture_binding() const
+  {
+    return allocator_support_texture_binding<T>();
+  }
+
+  static AllocatorBase * singleton()
+  {
+    return Singleton< Allocator<T> >::get();
+  }
+};
+
+//-----------------------------------------------------------------------------
+// AllocationTracker
+//-----------------------------------------------------------------------------
+
+// forward declaration for friend classes
+struct CopyWithoutTracking;
+struct MallocHelper;
+
+/// class AllocationTracker
+/// Will call deallocate from the AllocatorBase when the reference count reaches 0.
+/// Reference counting is disabled when the host is in parallel.
+class AllocationTracker
+{
+  // use the least significant bit of the AllocationRecord pointer to indicate if the
+  // AllocationTracker should reference count
+  enum {
+     REF_COUNT_BIT = static_cast<uintptr_t>(1)
+   , REF_COUNT_MASK = ~static_cast<uintptr_t>(1)
+  };
+
+public:
+
+  /// Find an AllocationTracker such that
+  /// alloc_ptr <= ptr < alloc_ptr + alloc_size
+  /// O(n) where n is the number of tracked allocations.
+  template <typename StaticAllocator>
+  static AllocationTracker find( void const * ptr )
+  {
+    return find( ptr, Allocator<StaticAllocator>::singleton() );
+  }
+
+
+  /// Pretty print all the currently tracked memory
+  static void print_tracked_memory( std::ostream & out );
+
+  /// Default constructor
+  KOKKOS_INLINE_FUNCTION
+  AllocationTracker()
+    : m_alloc_rec(0)
+  {}
+
+  /// Create a AllocationTracker
+  ///
+  /// Start reference counting the alloc_ptr.
+  /// When the reference count reachs 0 the allocator deallocate method
+  /// will be call with the given size.  The alloc_ptr should have been
+  /// allocated with the allocator's allocate method.
+  ///
+  /// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
+  /// do nothing
+  template <typename StaticAllocator>
+  AllocationTracker(  StaticAllocator const &
+                    , void * arg_alloc_ptr
+                    , size_t arg_alloc_size
+                    , const std::string & arg_label = std::string("") )
+    : m_alloc_rec(0)
+  {
+    AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
+    initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
+  }
+
+  /// Create a AllocationTracker
+  ///
+  /// Start reference counting the alloc_ptr.
+  /// When the reference count reachs 0 the allocator deallocate method
+  /// will be call with the given size.  The alloc_ptr should have been
+  /// allocated with the allocator's allocate method.
+  ///
+  /// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
+  /// do nothing
+  template <typename StaticAllocator>
+  AllocationTracker(  StaticAllocator const &
+                    , size_t arg_alloc_size
+                    , const std::string & arg_label = std::string("")
+                   )
+    : m_alloc_rec(0)
+  {
+    AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
+    void * arg_alloc_ptr = arg_allocator->allocate( arg_alloc_size );
+
+    initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
+  }
+
+  /// Copy an AllocatorTracker
+  KOKKOS_INLINE_FUNCTION
+  AllocationTracker( const AllocationTracker & rhs )
+    : m_alloc_rec( rhs.m_alloc_rec)
+  {
+#if !defined( __CUDA_ARCH__ )
+    if ( rhs.ref_counting() && tracking_enabled() ) {
+      increment_ref_count();
+    }
+    else {
+      m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
+    }
+#else
+    m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
+#endif
+  }
+
+  /// Copy an AllocatorTracker
+  /// Decrement the reference count of the current tracker if necessary
+  KOKKOS_INLINE_FUNCTION
+  AllocationTracker & operator=( const AllocationTracker & rhs )
+  {
+    if (this != &rhs) {
+#if !defined( __CUDA_ARCH__ )
+      if ( ref_counting() ) {
+        decrement_ref_count();
+      }
+
+      m_alloc_rec = rhs.m_alloc_rec;
+
+      if ( rhs.ref_counting() && tracking_enabled() ) {
+        increment_ref_count();
+      }
+      else {
+        m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
+      }
+#else
+      m_alloc_rec = rhs.m_alloc_rec & REF_COUNT_MASK;
+#endif
+    }
+
+    return * this;
+  }
+
+  /// Destructor
+  /// Decrement the reference count if necessary
+  KOKKOS_INLINE_FUNCTION
+  ~AllocationTracker()
+  {
+#if !defined( __CUDA_ARCH__ )
+    if ( ref_counting() ) {
+      decrement_ref_count();
+    }
+#endif
+  }
+
+  /// Is the tracker valid?
+  KOKKOS_INLINE_FUNCTION
+  bool is_valid() const
+  {
+    return (m_alloc_rec & REF_COUNT_MASK);
+  }
+
+
+
+  /// clear the tracker
+  KOKKOS_INLINE_FUNCTION
+  void clear()
+  {
+#if !defined( __CUDA_ARCH__ )
+    if ( ref_counting() ) {
+      decrement_ref_count();
+    }
+#endif
+    m_alloc_rec = 0;
+  }
+
+  /// is this tracker currently counting allocations?
+  KOKKOS_INLINE_FUNCTION
+  bool ref_counting() const
+  {
+    return (m_alloc_rec & REF_COUNT_BIT);
+  }
+
+  AllocatorBase * allocator() const;
+
+  /// pointer to the allocated memory
+  void * alloc_ptr()  const;
+
+  /// size in bytes of the allocated memory
+  size_t alloc_size() const;
+
+  /// the current reference count
+  size_t ref_count()  const;
+
+  /// the label given to the allocation
+  char const * label() const;
+
+  /// pretty print all the tracker's information to the std::ostream
+  void print( std::ostream & oss) const;
+
+
+  /// set an attribute ptr on the allocation record
+  /// the arg_attribute pointer will be deleted when the record is destroyed
+  /// the attribute ptr can only be set once
+  bool set_attribute( AllocatorAttributeBase * arg_attribute) const;
+
+  /// get the attribute ptr from the allocation record
+  AllocatorAttributeBase * attribute() const;
+
+
+  /// reallocate the memory tracked by this allocation
+  /// NOT thread-safe
+  void reallocate( size_t size ) const;
+
+private:
+
+  static AllocationTracker find( void const * ptr, AllocatorBase const * arg_allocator );
+
+  void initalize(  AllocatorBase * arg_allocator
+                 , void * arg_alloc_ptr
+                 , size_t arg_alloc_size
+                 , std::string const & label );
+
+  void increment_ref_count() const;
+  void decrement_ref_count() const;
+
+  static void disable_tracking();
+  static void enable_tracking();
+  static bool tracking_enabled();
+
+  friend struct Impl::CopyWithoutTracking;
+  friend struct Impl::MallocHelper;
+
+  uintptr_t m_alloc_rec;
+};
+
+
+
+/// Make a copy of the functor with reference counting disabled
+struct CopyWithoutTracking
+{
+  template <typename Functor>
+  static Functor apply( const Functor & f )
+  {
+    AllocationTracker::disable_tracking();
+    Functor func(f);
+    AllocationTracker::enable_tracking();
+    return func;
+  }
+};
+
+}} // namespace Kokkos::Impl
+
+#endif //KOKKOS_ALLOCATION_TRACKER_HPP
diff --git a/lib/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp b/lib/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..2de9df008ee5b42b5d38727ead56bae768869c43
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_AnalyzeShape.hpp
@@ -0,0 +1,260 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_ANALYZESHAPE_HPP
+#define KOKKOS_ANALYZESHAPE_HPP
+
+#include <impl/Kokkos_Shape.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+/** \brief  Analyze the array shape defined by a Kokkos::View data type.
+ *
+ *  It is presumed that the data type can be mapped down to a multidimensional
+ *  array of an intrinsic scalar numerical type (double, float, int, ... ).
+ *  The 'value_type' of an array may be an embedded aggregate type such
+ *  as a fixed length array 'Array<T,N>'.
+ *  In this case the 'array_intrinsic_type' represents the
+ *  underlying array of intrinsic scalar numerical type.
+ *
+ *  The embedded aggregate type must have an AnalyzeShape specialization
+ *  to map it down to a shape and intrinsic scalar numerical type.
+ */
+template< class T >
+struct AnalyzeShape : public Shape< sizeof(T) , 0 >
+{
+  typedef void specialize ;
+
+  typedef Shape< sizeof(T), 0 >  shape ;
+
+  typedef       T  array_intrinsic_type ;
+  typedef       T  value_type ;
+  typedef       T  type ;
+
+  typedef const T  const_array_intrinsic_type ;
+  typedef const T  const_value_type ;
+  typedef const T  const_type ;
+
+  typedef       T  non_const_array_intrinsic_type ;
+  typedef       T  non_const_value_type ;
+  typedef       T  non_const_type ;
+};
+
+template<>
+struct AnalyzeShape<void> : public Shape< 0 , 0 >
+{
+  typedef void specialize ;
+
+  typedef Shape< 0 , 0 >  shape ;
+
+  typedef       void  array_intrinsic_type ;
+  typedef       void  value_type ;
+  typedef       void  type ;
+  typedef const void  const_array_intrinsic_type ;
+  typedef const void  const_value_type ;
+  typedef const void  const_type ;
+  typedef       void  non_const_array_intrinsic_type ;
+  typedef       void  non_const_value_type ;
+  typedef       void  non_const_type ;
+};
+
+template< class T >
+struct AnalyzeShape< const T > : public AnalyzeShape<T>::shape
+{
+private:
+  typedef AnalyzeShape<T> nested ;
+public:
+
+  typedef typename nested::specialize specialize ;
+
+  typedef typename nested::shape shape ;
+
+  typedef typename nested::const_array_intrinsic_type  array_intrinsic_type ;
+  typedef typename nested::const_value_type            value_type ;
+  typedef typename nested::const_type                  type ;
+
+  typedef typename nested::const_array_intrinsic_type  const_array_intrinsic_type ;
+  typedef typename nested::const_value_type            const_value_type ;
+  typedef typename nested::const_type                  const_type ;
+
+  typedef typename nested::non_const_array_intrinsic_type  non_const_array_intrinsic_type ;
+  typedef typename nested::non_const_value_type            non_const_value_type ;
+  typedef typename nested::non_const_type                  non_const_type ;
+};
+
+template< class T >
+struct AnalyzeShape< T * >
+  : public ShapeInsert< typename AnalyzeShape<T>::shape , 0 >::type
+{
+private:
+  typedef AnalyzeShape<T> nested ;
+public:
+
+  typedef typename nested::specialize specialize ;
+
+  typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ;
+
+  typedef typename nested::array_intrinsic_type * array_intrinsic_type ;
+  typedef typename nested::value_type             value_type ;
+  typedef typename nested::type                 * type ;
+
+  typedef typename nested::const_array_intrinsic_type * const_array_intrinsic_type ;
+  typedef typename nested::const_value_type             const_value_type ;
+  typedef typename nested::const_type                 * const_type ;
+
+  typedef typename nested::non_const_array_intrinsic_type * non_const_array_intrinsic_type ;
+  typedef typename nested::non_const_value_type             non_const_value_type ;
+  typedef typename nested::non_const_type                 * non_const_type ;
+};
+
+template< class T >
+struct AnalyzeShape< T[] >
+  : public ShapeInsert< typename AnalyzeShape<T>::shape , 0 >::type
+{
+private:
+  typedef AnalyzeShape<T> nested ;
+public:
+
+  typedef typename nested::specialize specialize ;
+
+  typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ;
+
+  typedef typename nested::array_intrinsic_type  array_intrinsic_type [] ;
+  typedef typename nested::value_type            value_type ;
+  typedef typename nested::type                  type [] ;
+
+  typedef typename nested::const_array_intrinsic_type  const_array_intrinsic_type [] ;
+  typedef typename nested::const_value_type            const_value_type ;
+  typedef typename nested::const_type                  const_type [] ;
+
+  typedef typename nested::non_const_array_intrinsic_type  non_const_array_intrinsic_type [] ;
+  typedef typename nested::non_const_value_type            non_const_value_type ;
+  typedef typename nested::non_const_type                  non_const_type [] ;
+};
+
+template< class T >
+struct AnalyzeShape< const T[] >
+  : public ShapeInsert< typename AnalyzeShape< const T >::shape , 0 >::type
+{
+private:
+  typedef AnalyzeShape< const T > nested ;
+public:
+
+  typedef typename nested::specialize specialize ;
+
+  typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ;
+
+  typedef typename nested::array_intrinsic_type  array_intrinsic_type [] ;
+  typedef typename nested::value_type            value_type ;
+  typedef typename nested::type                  type [] ;
+
+  typedef typename nested::const_array_intrinsic_type  const_array_intrinsic_type [] ;
+  typedef typename nested::const_value_type            const_value_type ;
+  typedef typename nested::const_type                  const_type [] ;
+
+  typedef typename nested::non_const_array_intrinsic_type  non_const_array_intrinsic_type [] ;
+  typedef typename nested::non_const_value_type            non_const_value_type ;
+  typedef typename nested::non_const_type                  non_const_type [] ;
+};
+
+template< class T , unsigned N >
+struct AnalyzeShape< T[N] >
+  : public ShapeInsert< typename AnalyzeShape<T>::shape , N >::type
+{
+private:
+  typedef AnalyzeShape<T> nested ;
+public:
+
+  typedef typename nested::specialize specialize ;
+
+  typedef typename ShapeInsert< typename nested::shape , N >::type shape ;
+
+  typedef typename nested::array_intrinsic_type  array_intrinsic_type [N] ;
+  typedef typename nested::value_type            value_type ;
+  typedef typename nested::type                  type [N] ;
+
+  typedef typename nested::const_array_intrinsic_type  const_array_intrinsic_type [N] ;
+  typedef typename nested::const_value_type            const_value_type ;
+  typedef typename nested::const_type                  const_type [N] ;
+
+  typedef typename nested::non_const_array_intrinsic_type  non_const_array_intrinsic_type [N] ;
+  typedef typename nested::non_const_value_type            non_const_value_type ;
+  typedef typename nested::non_const_type                  non_const_type [N] ;
+};
+
+template< class T , unsigned N >
+struct AnalyzeShape< const T[N] >
+  : public ShapeInsert< typename AnalyzeShape< const T >::shape , N >::type
+{
+private:
+  typedef AnalyzeShape< const T > nested ;
+public:
+
+  typedef typename nested::specialize specialize ;
+
+  typedef typename ShapeInsert< typename nested::shape , N >::type shape ;
+
+  typedef typename nested::array_intrinsic_type  array_intrinsic_type [N] ;
+  typedef typename nested::value_type            value_type ;
+  typedef typename nested::type                  type [N] ;
+
+  typedef typename nested::const_array_intrinsic_type  const_array_intrinsic_type [N] ;
+  typedef typename nested::const_value_type            const_value_type ;
+  typedef typename nested::const_type                  const_type [N] ;
+
+  typedef typename nested::non_const_array_intrinsic_type  non_const_array_intrinsic_type [N] ;
+  typedef typename nested::non_const_value_type            non_const_value_type ;
+  typedef typename nested::non_const_type                  non_const_type [N] ;
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_ANALYZESHAPE_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Assembly_X86.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Assembly_X86.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..e9c7a16d585060bcc76e6bb133010bf45b4ea2d5
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Assembly_X86.hpp
@@ -0,0 +1,214 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_ASSEMBLY_X86_HPP )
+#define KOKKOS_ATOMIC_ASSEMBLY_X86_HPP
+namespace Kokkos {
+
+#ifdef KOKKOS_ENABLE_ASM
+#ifndef __CUDA_ARCH__
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment<char>(volatile char* a) {
+  __asm__ __volatile__(
+    "lock incb %0"
+    : /* no output registers */
+    : "m" (a[0])
+    : "memory"
+  );
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment<short>(volatile short* a) {
+  __asm__ __volatile__(
+    "lock incw %0"
+    : /* no output registers */
+    : "m" (a[0])
+    : "memory"
+  );
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment<int>(volatile int* a) {
+  __asm__ __volatile__(
+    "lock incl %0"
+    : /* no output registers */
+    : "m" (a[0])
+    : "memory"
+  );
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment<long long int>(volatile long long int* a) {
+  __asm__ __volatile__(
+    "lock incq %0"
+    : /* no output registers */
+    : "m" (a[0])
+    : "memory"
+  );
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement<char>(volatile char* a) {
+  __asm__ __volatile__(
+    "lock decb %0"
+    : /* no output registers */
+    : "m" (a[0])
+    : "memory"
+  );
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement<short>(volatile short* a) {
+  __asm__ __volatile__(
+    "lock decw %0"
+    : /* no output registers */
+    : "m" (a[0])
+    : "memory"
+  );
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement<int>(volatile int* a) {
+  __asm__ __volatile__(
+    "lock decl %0"
+    : /* no output registers */
+    : "m" (a[0])
+    : "memory"
+  );
+}
+
+template<>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement<long long int>(volatile long long int* a) {
+  __asm__ __volatile__(
+    "lock decq %0"
+    : /* no output registers */
+    : "m" (a[0])
+    : "memory"
+  );
+}
+#endif
+#endif
+
+namespace Impl {
+  struct cas128_t
+  {
+    uint64_t lower;
+    uint64_t upper;
+
+    KOKKOS_INLINE_FUNCTION
+    cas128_t () {
+      lower = 0;
+      upper = 0;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    cas128_t (const cas128_t& a) {
+      lower = a.lower;
+      upper = a.upper;
+    }
+    KOKKOS_INLINE_FUNCTION
+    cas128_t (volatile cas128_t* a) {
+      lower = a->lower;
+      upper = a->upper;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    bool operator != (const cas128_t& a) const {
+      return (lower != a.lower) || upper!=a.upper;
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator = (const cas128_t& a) {
+      lower = a.lower;
+      upper = a.upper;
+    }
+    KOKKOS_INLINE_FUNCTION
+    void operator = (const cas128_t& a) volatile {
+      lower = a.lower;
+      upper = a.upper;
+    }
+  }
+  __attribute__ (( __aligned__( 16 ) ));
+
+
+
+
+  inline cas128_t cas128( volatile cas128_t * ptr, cas128_t cmp,  cas128_t swap )
+  {
+    #ifdef KOKKOS_ENABLE_ASM
+    bool swapped = false;
+    __asm__ __volatile__
+    (
+     "lock cmpxchg16b %1\n\t"
+     "setz %0"
+     : "=q" ( swapped )
+     , "+m" ( *ptr )
+     , "+d" ( cmp.upper )
+     , "+a" ( cmp.lower )
+     : "c" ( swap.upper )
+     , "b" ( swap.lower )
+     , "q" ( swapped )
+    );
+    return cmp;
+    #else
+      cas128_t tmp(ptr);
+      if(tmp !=  cmp) {
+        return tmp;
+      } else {
+        *ptr = swap;
+        return swap;
+      }
+    #endif
+  }
+
+}
+}
+
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..524cd7327d6f657156f45fc80b61564935582b74
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp
@@ -0,0 +1,259 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP )
+#define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+// Cuda native CAS supports int, unsigned int, and unsigned long long int (non-standard type).
+// Must cast-away 'volatile' for the CAS call.
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+__inline__ __device__
+int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
+{ return atomicCAS((int*)dest,compare,val); }
+
+__inline__ __device__
+unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val)
+{ return atomicCAS((unsigned int*)dest,compare,val); }
+
+__inline__ __device__
+unsigned long long int atomic_compare_exchange( volatile unsigned long long int * const dest ,
+                                                const unsigned long long int compare ,
+                                                const unsigned long long int val )
+{ return atomicCAS((unsigned long long int*)dest,compare,val); }
+
+template < typename T >
+__inline__ __device__
+T atomic_compare_exchange( volatile T * const dest , const T & compare ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
+{
+  const int tmp = atomicCAS( (int*) dest , *((int*)&compare) , *((int*)&val) );
+  return *((T*)&tmp);
+}
+
+template < typename T >
+__inline__ __device__
+T atomic_compare_exchange( volatile T * const dest , const T & compare ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
+{
+  typedef unsigned long long int type ;
+  const type tmp = atomicCAS( (type*) dest , *((type*)&compare) , *((type*)&val) );
+  return *((T*)&tmp);
+}
+
+template < typename T >
+__inline__ __device__
+T atomic_compare_exchange( volatile T * const dest , const T & compare ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+             , const T >::type& val )
+{
+  T return_val;
+  // This is a way to (hopefully) avoid dead lock in a warp
+  bool done = false;
+  while (! done ) {
+    if( Impl::lock_address_cuda_space( (void*) dest ) ) {
+      return_val = *dest;
+      if( return_val == compare )
+        *dest = val;
+      Impl::unlock_address_cuda_space( (void*) dest );
+    }
+  }
+  return return_val;
+}
+
+//----------------------------------------------------------------------------
+// GCC native CAS supports int, long, unsigned int, unsigned long.
+// Intel native CAS support int and long with the same interface as GCC.
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+KOKKOS_INLINE_FUNCTION
+int atomic_compare_exchange( volatile int * const dest, const int compare, const int val)
+{ return __sync_val_compare_and_swap(dest,compare,val); }
+
+KOKKOS_INLINE_FUNCTION
+long atomic_compare_exchange( volatile long * const dest, const long compare, const long val )
+{ return __sync_val_compare_and_swap(dest,compare,val); }
+
+#if defined( KOKKOS_ATOMICS_USE_GCC )
+
+// GCC supports unsigned
+
+KOKKOS_INLINE_FUNCTION
+unsigned int atomic_compare_exchange( volatile unsigned int * const dest, const unsigned int compare, const unsigned int val )
+{ return __sync_val_compare_and_swap(dest,compare,val); }
+
+KOKKOS_INLINE_FUNCTION
+unsigned long atomic_compare_exchange( volatile unsigned long * const dest ,
+                                       const unsigned long compare ,
+                                       const unsigned long val )
+{ return __sync_val_compare_and_swap(dest,compare,val); }
+
+#endif
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange( volatile T * const dest, const T & compare,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
+{
+#ifdef KOKKOS_HAVE_CXX11
+  union U {
+    int i ;
+    T t ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  } tmp ;
+#else
+  union U {
+    int i ;
+    T t ;
+  } tmp ;
+#endif
+
+  tmp.i = __sync_val_compare_and_swap( (int*) dest , *((int*)&compare) , *((int*)&val) );
+  return tmp.t ;
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange( volatile T * const dest, const T & compare,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(long) , const T & >::type val )
+{
+#ifdef KOKKOS_HAVE_CXX11
+  union U {
+    long i ;
+    T t ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  } tmp ;
+#else
+  union U {
+    long i ;
+    T t ;
+  } tmp ;
+#endif
+
+  tmp.i = __sync_val_compare_and_swap( (long*) dest , *((long*)&compare) , *((long*)&val) );
+  return tmp.t ;
+}
+
+#ifdef KOKKOS_ENABLE_ASM
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange( volatile T * const dest, const T & compare,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) != sizeof(long) &&
+                                    sizeof(T) == sizeof(Impl::cas128_t), const T & >::type val )
+{
+  union U {
+    Impl::cas128_t i ;
+    T t ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  } tmp ;
+
+  tmp.i = Impl::cas128( (Impl::cas128_t*) dest , *((Impl::cas128_t*)&compare) , *((Impl::cas128_t*)&val) );
+  return tmp.t ;
+}
+#endif
+
+template < typename T >
+inline
+T atomic_compare_exchange( volatile T * const dest , const T compare ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+            #if defined(KOKKOS_ENABLE_ASM)
+               && ( sizeof(T) != 16 )
+            #endif
+             , const T >::type& val )
+{
+  while( !Impl::lock_address_host_space( (void*) dest ) );
+  T return_val = *dest;
+  if( return_val == compare ) {
+    const T tmp = *dest = val;
+    #ifndef KOKKOS_COMPILER_CLANG
+    (void) tmp;
+    #endif
+  }
+  Impl::unlock_address_host_space( (void*) dest );
+  return return_val;
+}
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_compare_exchange( volatile T * const dest, const T compare, const T val )
+{
+  T retval;
+#pragma omp critical
+  {
+    retval = dest[0];
+    if ( retval == compare )
+  	dest[0] = val;
+  }
+  return retval;
+}
+
+#endif
+
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+bool atomic_compare_exchange_strong(volatile T* const dest, const T compare, const T val)
+{
+  return compare == atomic_compare_exchange(dest, compare, val);
+}
+
+//----------------------------------------------------------------------------
+
+} // namespace Kokkos
+
+#endif
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..1bdbdbc7f904e7ef284d818015b9c059033ca2a6
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp
@@ -0,0 +1,340 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_EXCHANGE_HPP )
+#define KOKKOS_ATOMIC_EXCHANGE_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+__inline__ __device__
+int atomic_exchange( volatile int * const dest , const int val )
+{
+  // return __iAtomicExch( (int*) dest , val );
+  return atomicExch( (int*) dest , val );
+}
+
+__inline__ __device__
+unsigned int atomic_exchange( volatile unsigned int * const dest , const unsigned int val )
+{
+  // return __uAtomicExch( (unsigned int*) dest , val );
+  return atomicExch( (unsigned int*) dest , val );
+}
+
+__inline__ __device__
+unsigned long long int atomic_exchange( volatile unsigned long long int * const dest , const unsigned long long int val )
+{
+  // return __ullAtomicExch( (unsigned long long*) dest , val );
+  return atomicExch( (unsigned long long*) dest , val );
+}
+
+/** \brief  Atomic exchange for any type with compatible size */
+template< typename T >
+__inline__ __device__
+T atomic_exchange(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
+{
+  // int tmp = __ullAtomicExch( (int*) dest , *((int*)&val) );
+  int tmp = atomicExch( ((int*)dest) , *((int*)&val) );
+  return *((T*)&tmp);
+}
+
+template< typename T >
+__inline__ __device__
+T atomic_exchange(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
+{
+  typedef unsigned long long int type ;
+  // type tmp = __ullAtomicExch( (type*) dest , *((type*)&val) );
+  type tmp = atomicExch( ((type*)dest) , *((type*)&val) );
+  return *((T*)&tmp);
+}
+
+template < typename T >
+__inline__ __device__
+T atomic_exchange( volatile T * const dest ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+             , const T >::type& val )
+{
+  T return_val;
+  // This is a way to (hopefully) avoid dead lock in a warp
+  bool done = false;
+  while (! done ) {
+    if( Impl::lock_address_cuda_space( (void*) dest ) ) {
+      return_val = *dest;
+      *dest = val;
+      Impl::unlock_address_cuda_space( (void*) dest );
+    }
+  }
+  return return_val;
+}
+/** \brief  Atomic exchange for any type with compatible size */
+template< typename T >
+__inline__ __device__
+void atomic_assign(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T & >::type val )
+{
+  // (void) __ullAtomicExch( (int*) dest , *((int*)&val) );
+  (void) atomicExch( ((int*)dest) , *((int*)&val) );
+}
+
+template< typename T >
+__inline__ __device__
+void atomic_assign(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T & >::type val )
+{
+  typedef unsigned long long int type ;
+  // (void) __ullAtomicExch( (type*) dest , *((type*)&val) );
+  (void) atomicExch( ((type*)dest) , *((type*)&val) );
+}
+
+template< typename T >
+__inline__ __device__
+void atomic_assign(
+  volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) != sizeof(unsigned long long int)
+                                  , const T & >::type val )
+{
+  (void) atomic_exchange(dest,val);
+}
+
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_exchange( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long)
+                                  , const T & >::type val )
+{
+  typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;
+
+  const type v = *((type*)&val); // Extract to be sure the value doesn't change
+
+  type assumed ;
+
+#ifdef KOKKOS_HAVE_CXX11
+  union U {
+    T val_T ;
+    type val_type ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  } old ;
+#else
+  union { T val_T ; type val_type ; } old ;
+#endif
+
+  old.val_T = *dest ;
+
+  do {
+    assumed = old.val_type ;
+    old.val_type = __sync_val_compare_and_swap( (volatile type *) dest , assumed , v );
+  } while ( assumed != old.val_type );
+
+  return old.val_T ;
+}
+
+#if defined(KOKKOS_ENABLE_ASM)
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_exchange( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
+                                  , const T & >::type val )
+{
+  union U {
+    Impl::cas128_t i ;
+    T t ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  } assume , oldval , newval ;
+
+  oldval.t = *dest ;
+  newval.t = val;
+
+  do {
+    assume.i = oldval.i ;
+    oldval.i = Impl::cas128( (volatile Impl::cas128_t*) dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+#endif
+
+//----------------------------------------------------------------------------
+
+template < typename T >
+inline
+T atomic_exchange( volatile T * const dest ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+              #if defined(KOKKOS_ENABLE_ASM)
+               && ( sizeof(T) != 16 )
+              #endif
+                 , const T >::type& val )
+{
+  while( !Impl::lock_address_host_space( (void*) dest ) );
+  T return_val = *dest;
+  const T tmp = *dest = val;
+  #ifndef KOKKOS_COMPILER_CLANG
+  (void) tmp;
+  #endif
+  Impl::unlock_address_host_space( (void*) dest );
+  return return_val;
+}
+
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+void atomic_assign( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) || sizeof(T) == sizeof(long)
+                                  , const T & >::type val )
+{
+  typedef typename Kokkos::Impl::if_c< sizeof(T) == sizeof(int) , int , long >::type type ;
+
+  const type v = *((type*)&val); // Extract to be sure the value doesn't change
+
+  type assumed ;
+
+#ifdef KOKKOS_HAVE_CXX11
+  union U {
+    T val_T ;
+    type val_type ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  } old ;
+#else
+  union { T val_T ; type val_type ; } old ;
+#endif
+
+  old.val_T = *dest ;
+
+  do {
+    assumed = old.val_type ;
+    old.val_type = __sync_val_compare_and_swap( (volatile type *) dest , assumed , v );
+  } while ( assumed != old.val_type );
+}
+
+#ifdef KOKKOS_ENABLE_ASM
+template< typename T >
+KOKKOS_INLINE_FUNCTION
+void atomic_assign( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t)
+                                  , const T & >::type val )
+{
+  union U {
+    Impl::cas128_t i ;
+    T t ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  } assume , oldval , newval ;
+
+  oldval.t = *dest ;
+  newval.t = val;
+  do {
+    assume.i = oldval.i ;
+    oldval.i = Impl::cas128( (volatile Impl::cas128_t*) dest , assume.i , newval.i);
+  } while ( assume.i != oldval.i );
+}
+#endif
+
+template < typename T >
+inline
+void atomic_assign( volatile T * const dest ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+              #if defined(KOKKOS_ENABLE_ASM)
+               && ( sizeof(T) != 16 )
+              #endif
+                 , const T >::type& val )
+{
+  while( !Impl::lock_address_host_space( (void*) dest ) );
+  *dest = val;
+  Impl::unlock_address_host_space( (void*) dest );
+}
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_exchange( volatile T * const dest , const T val )
+{
+  T retval;
+//#pragma omp atomic capture
+  #pragma omp critical
+  {
+    retval = dest[0];
+    dest[0] = val;
+  }
+  return retval;
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+void atomic_assign( volatile T * const dest , const T val )
+{
+//#pragma omp atomic
+  #pragma omp critical
+  {
+    dest[0] = val;
+  }
+}
+
+#endif
+
+} // namespace Kokkos
+
+#endif
+
+//----------------------------------------------------------------------------
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..b06a5b424313d1b9a943de94b38d27f1158d74ca
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp
@@ -0,0 +1,326 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP )
+#define KOKKOS_ATOMIC_FETCH_ADD_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+// Support for int, unsigned int, unsigned long long int, and float
+
+__inline__ __device__
+int atomic_fetch_add( volatile int * const dest , const int val )
+{ return atomicAdd((int*)dest,val); }
+
+__inline__ __device__
+unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
+{ return atomicAdd((unsigned int*)dest,val); }
+
+__inline__ __device__
+unsigned long long int atomic_fetch_add( volatile unsigned long long int * const dest ,
+                                         const unsigned long long int val )
+{ return atomicAdd((unsigned long long int*)dest,val); }
+
+__inline__ __device__
+float atomic_fetch_add( volatile float * const dest , const float val )
+{ return atomicAdd((float*)dest,val); }
+
+template < typename T >
+__inline__ __device__
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+#ifdef KOKKOS_HAVE_CXX11
+  union U {
+    int i ;
+    T t ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  } assume , oldval , newval ;
+#else
+  union U {
+    int i ;
+    T t ;
+  } assume , oldval , newval ;
+#endif
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t + val ;
+    oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
+  } while ( assumed.i != oldval.i );
+
+  return oldval.t ;
+}
+
+template < typename T >
+__inline__ __device__
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
+{
+#ifdef KOKKOS_HAVE_CXX11
+  union U {
+    unsigned long long int i ;
+    T t ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  } assume , oldval , newval ;
+#else
+  union U {
+    unsigned long long int i ;
+    T t ;
+  } assume , oldval , newval ;
+#endif
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t + val ;
+    oldval.i = atomicCAS( (unsigned long long int*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+//----------------------------------------------------------------------------
+
+template < typename T >
+__inline__ __device__
+T atomic_fetch_add( volatile T * const dest ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+             , const T >::type& val )
+{
+  T return_val;
+  // This is a way to (hopefully) avoid dead lock in a warp
+  bool done = false;
+  while (! done ) {
+    if( Impl::lock_address_cuda_space( (void*) dest ) ) {
+      return_val = *dest;
+      *dest = return_val + val;
+      Impl::unlock_address_cuda_space( (void*) dest );
+    }
+  }
+  return return_val;
+}
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+KOKKOS_INLINE_FUNCTION
+int atomic_fetch_add( volatile int * const dest , const int val )
+{ return __sync_fetch_and_add(dest,val); }
+
+KOKKOS_INLINE_FUNCTION
+long int atomic_fetch_add( volatile long int * const dest , const long int val )
+{ return __sync_fetch_and_add(dest,val); }
+
+#if defined( KOKKOS_ATOMICS_USE_GCC )
+
+KOKKOS_INLINE_FUNCTION
+unsigned int atomic_fetch_add( volatile unsigned int * const dest , const unsigned int val )
+{ return __sync_fetch_and_add(dest,val); }
+
+KOKKOS_INLINE_FUNCTION
+unsigned long int atomic_fetch_add( volatile unsigned long int * const dest , const unsigned long int val )
+{ return __sync_fetch_and_add(dest,val); }
+
+#endif
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+#ifdef KOKKOS_HAVE_CXX11
+  union U {
+    int i ;
+    T t ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  } assume , oldval , newval ;
+#else
+  union U {
+    int i ;
+    T t ;
+  } assume , oldval , newval ;
+#endif
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t + val ;
+    oldval.i = __sync_val_compare_and_swap( (int*) dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(long) , const T >::type val )
+{
+#ifdef KOKKOS_HAVE_CXX11
+  union U {
+    long i ;
+    T t ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  } assume , oldval , newval ;
+#else
+  union U {
+    long i ;
+    T t ;
+  } assume , oldval , newval ;
+#endif
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t + val ;
+    oldval.i = __sync_val_compare_and_swap( (long*) dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+#ifdef KOKKOS_ENABLE_ASM
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_add( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) != sizeof(long) &&
+                                    sizeof(T) == sizeof(Impl::cas128_t) , const T >::type val )
+{
+  union U {
+    Impl::cas128_t i ;
+    T t ;
+    KOKKOS_INLINE_FUNCTION U() {};
+  } assume , oldval , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t + val ;
+    oldval.i = Impl::cas128( (volatile Impl::cas128_t*) dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+#endif
+
+//----------------------------------------------------------------------------
+
+template < typename T >
+inline
+T atomic_fetch_add( volatile T * const dest ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+              #if defined(KOKKOS_ENABLE_ASM)
+               && ( sizeof(T) != 16 )
+              #endif
+                 , const T >::type& val )
+{
+  while( !Impl::lock_address_host_space( (void*) dest ) );
+  T return_val = *dest;
+  const T tmp = *dest = return_val + val;
+  #ifndef KOKKOS_COMPILER_CLANG
+  (void) tmp;
+  #endif
+  Impl::unlock_address_host_space( (void*) dest );
+  return return_val;
+}
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+template< typename T >
+T atomic_fetch_add( volatile T * const dest , const T val )
+{
+  T retval;
+#pragma omp atomic capture
+  {
+    retval = dest[0];
+    dest[0] += val;
+  }
+  return retval;
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+// Simpler version of atomic_fetch_add without the fetch
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_add(volatile T * const dest, const T src) {
+  atomic_fetch_add(dest,src);
+}
+
+// Atomic increment
+template<typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_increment(volatile T* a) {
+  Kokkos::atomic_fetch_add(a,1);
+}
+
+template<typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_decrement(volatile T* a) {
+  Kokkos::atomic_fetch_add(a,-1);
+}
+
+}
+#endif
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..9b7ebae4ac6df12bae659e50aa7da34429ac3187
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_And.hpp
@@ -0,0 +1,125 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_AND_HPP )
+#define KOKKOS_ATOMIC_FETCH_AND_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+// Support for int, unsigned int, unsigned long long int, and float
+
+__inline__ __device__
+int atomic_fetch_and( volatile int * const dest , const int val )
+{ return atomicAnd((int*)dest,val); }
+
+__inline__ __device__
+unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
+{ return atomicAnd((unsigned int*)dest,val); }
+
+#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
+__inline__ __device__
+unsigned long long int atomic_fetch_and( volatile unsigned long long int * const dest ,
+                                         const unsigned long long int val )
+{ return atomicAnd((unsigned long long int*)dest,val); }
+#endif
+
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+KOKKOS_INLINE_FUNCTION
+int atomic_fetch_and( volatile int * const dest , const int val )
+{ return __sync_fetch_and_and(dest,val); }
+
+KOKKOS_INLINE_FUNCTION
+long int atomic_fetch_and( volatile long int * const dest , const long int val )
+{ return __sync_fetch_and_and(dest,val); }
+
+#if defined( KOKKOS_ATOMICS_USE_GCC )
+
+KOKKOS_INLINE_FUNCTION
+unsigned int atomic_fetch_and( volatile unsigned int * const dest , const unsigned int val )
+{ return __sync_fetch_and_and(dest,val); }
+
+KOKKOS_INLINE_FUNCTION
+unsigned long int atomic_fetch_and( volatile unsigned long int * const dest , const unsigned long int val )
+{ return __sync_fetch_and_and(dest,val); }
+
+#endif
+
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+template< typename T >
+T atomic_fetch_and( volatile T * const dest , const T val )
+{
+  T retval;
+#pragma omp atomic capture
+  {
+    retval = dest[0];
+    dest[0] &= val;
+  }
+  return retval;
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+// Simpler version of atomic_fetch_and without the fetch
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_and(volatile T * const dest, const T src) {
+  (void)atomic_fetch_and(dest,src);
+}
+
+}
+
+#endif
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..f15e61a3aea2ac2e7120d88a7151390cc2bf0b73
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Or.hpp
@@ -0,0 +1,125 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_OR_HPP )
+#define KOKKOS_ATOMIC_FETCH_OR_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+// Support for int, unsigned int, unsigned long long int, and float
+
+__inline__ __device__
+int atomic_fetch_or( volatile int * const dest , const int val )
+{ return atomicOr((int*)dest,val); }
+
+__inline__ __device__
+unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
+{ return atomicOr((unsigned int*)dest,val); }
+
+#if defined( __CUDA_ARCH__ ) && ( 350 <= __CUDA_ARCH__ )
+__inline__ __device__
+unsigned long long int atomic_fetch_or( volatile unsigned long long int * const dest ,
+                                         const unsigned long long int val )
+{ return atomicOr((unsigned long long int*)dest,val); }
+#endif
+
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+KOKKOS_INLINE_FUNCTION
+int atomic_fetch_or( volatile int * const dest , const int val )
+{ return __sync_fetch_and_or(dest,val); }
+
+KOKKOS_INLINE_FUNCTION
+long int atomic_fetch_or( volatile long int * const dest , const long int val )
+{ return __sync_fetch_and_or(dest,val); }
+
+#if defined( KOKKOS_ATOMICS_USE_GCC )
+
+KOKKOS_INLINE_FUNCTION
+unsigned int atomic_fetch_or( volatile unsigned int * const dest , const unsigned int val )
+{ return __sync_fetch_and_or(dest,val); }
+
+KOKKOS_INLINE_FUNCTION
+unsigned long int atomic_fetch_or( volatile unsigned long int * const dest , const unsigned long int val )
+{ return __sync_fetch_and_or(dest,val); }
+
+#endif
+
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+template< typename T >
+T atomic_fetch_or( volatile T * const dest , const T val )
+{
+  T retval;
+#pragma omp atomic capture
+  {
+    retval = dest[0];
+    dest[0] |= val;
+  }
+  return retval;
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+// Simpler version of atomic_fetch_or without the fetch
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_or(volatile T * const dest, const T src) {
+  (void)atomic_fetch_or(dest,src);
+}
+
+}
+
+#endif
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..259cba794ac6776f562260a6c3bb69a6afc67308
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp
@@ -0,0 +1,233 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_SUB_HPP )
+#define KOKKOS_ATOMIC_FETCH_SUB_HPP
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+
+// Support for int, unsigned int, unsigned long long int, and float
+
+__inline__ __device__
+int atomic_fetch_sub( volatile int * const dest , const int val )
+{ return atomicSub((int*)dest,val); }
+
+__inline__ __device__
+unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val )
+{ return atomicSub((unsigned int*)dest,val); }
+
+template < typename T >
+__inline__ __device__
+T atomic_fetch_sub( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  union { int i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t - val ;
+    oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
+  } while ( assumed.i != oldval.i );
+
+  return oldval.t ;
+}
+
+template < typename T >
+__inline__ __device__
+T atomic_fetch_sub( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
+{
+  union { unsigned long long int i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t - val ;
+    oldval.i = atomicCAS( (unsigned long long int*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+
+//----------------------------------------------------------------------------
+
+template < typename T >
+__inline__ __device__
+T atomic_fetch_sub( volatile T * const dest ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+             , const T >::type& val )
+{
+  T return_val;
+  // This is a way to (hopefully) avoid dead lock in a warp
+  bool done = false;
+  while (! done ) {
+    if( Impl::lock_address_cuda_space( (void*) dest ) ) {
+      return_val = *dest;
+      *dest = return_val - val;
+      Impl::unlock_address_cuda_space( (void*) dest );
+    }
+  }
+  return return_val;
+}
+
+//----------------------------------------------------------------------------
+
+#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+
+KOKKOS_INLINE_FUNCTION
+int atomic_fetch_sub( volatile int * const dest , const int val )
+{ return __sync_fetch_and_sub(dest,val); }
+
+KOKKOS_INLINE_FUNCTION
+long int atomic_fetch_sub( volatile long int * const dest , const long int val )
+{ return __sync_fetch_and_sub(dest,val); }
+
+#if defined( KOKKOS_ATOMICS_USE_GCC )
+
+KOKKOS_INLINE_FUNCTION
+unsigned int atomic_fetch_sub( volatile unsigned int * const dest , const unsigned int val )
+{ return __sync_fetch_and_sub(dest,val); }
+
+KOKKOS_INLINE_FUNCTION
+unsigned long int atomic_fetch_sub( volatile unsigned long int * const dest , const unsigned long int val )
+{ return __sync_fetch_and_sub(dest,val); }
+
+#endif
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_sub( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  union { int i ; T t ; } assume , oldval , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t - val ;
+    oldval.i = __sync_val_compare_and_swap( (int*) dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_sub( volatile T * const dest ,
+  typename Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(long) , const T >::type val )
+{
+  union { long i ; T t ; } assume , oldval , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = assume.t - val ;
+    oldval.i = __sync_val_compare_and_swap( (long*) dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+
+//----------------------------------------------------------------------------
+
+template < typename T >
+inline
+T atomic_fetch_sub( volatile T * const dest ,
+    typename ::Kokkos::Impl::enable_if<
+                  ( sizeof(T) != 4 )
+               && ( sizeof(T) != 8 )
+             , const T >::type& val )
+{
+  while( !Impl::lock_address_host_space( (void*) dest ) );
+  T return_val = *dest;
+  *dest = return_val - val;
+  Impl::unlock_address_host_space( (void*) dest );
+  return return_val;
+}
+
+//----------------------------------------------------------------------------
+
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+
+template< typename T >
+T atomic_fetch_sub( volatile T * const dest , const T val )
+{
+  T retval;
+#pragma omp atomic capture
+  {
+    retval = dest[0];
+    dest[0] -= val;
+  }
+  return retval;
+}
+
+#endif
+
+// Simpler version of atomic_fetch_sub without the fetch
+template <typename T>
+KOKKOS_INLINE_FUNCTION
+void atomic_sub(volatile T * const dest, const T src) {
+  atomic_fetch_sub(dest,src);
+}
+
+}
+
+#include<impl/Kokkos_Atomic_Assembly_X86.hpp>
+#endif
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..bd968633bb69a8aec9bf8650558c5b140b9c504f
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp
@@ -0,0 +1,375 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_GENERIC_HPP )
+#define KOKKOS_ATOMIC_GENERIC_HPP
+#include <Kokkos_Macros.hpp>
+
+// Combination operands to be used in an Compare and Exchange based atomic operation
+namespace Kokkos {
+namespace Impl {
+
+template<class Scalar1, class Scalar2>
+struct AddOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1+val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct SubOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1-val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct MulOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1*val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct DivOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1/val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct ModOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1%val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct AndOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1&val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct OrOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1|val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct XorOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1^val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct LShiftOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1<<val2;
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct RShiftOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return val1>>val2;
+  }
+};
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
+{
+  union { unsigned long long int i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = Oper::apply(assume.t, val) ;
+    oldval.i = ::Kokkos::atomic_compare_exchange( (unsigned long long int*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if< sizeof(T) != sizeof(int) &&
+                                    sizeof(T) == sizeof(unsigned long long int) , const T >::type val )
+{
+  union { unsigned long long int i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = Oper::apply(assume.t, val) ;
+    oldval.i = ::Kokkos::atomic_compare_exchange( (unsigned long long int*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return newval.t ;
+}
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(int) , const T >::type val )
+{
+  union { int i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = Oper::apply(assume.t, val) ;
+    oldval.i = ::Kokkos::atomic_compare_exchange( (int*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return oldval.t ;
+}
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if< sizeof(T) == sizeof(int), const T >::type val )
+{
+  union { int i ; T t ; } oldval , assume , newval ;
+
+  oldval.t = *dest ;
+
+  do {
+    assume.i = oldval.i ;
+    newval.t = Oper::apply(assume.t, val) ;
+    oldval.i = ::Kokkos::atomic_compare_exchange( (int*)dest , assume.i , newval.i );
+  } while ( assume.i != oldval.i );
+
+  return newval.t ;
+}
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_oper( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if<
+                ( sizeof(T) != 4 )
+             && ( sizeof(T) != 8 )
+          #if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
+             && ( sizeof(T) != 16 )
+          #endif
+           , const T >::type val )
+{
+
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+  while( !Impl::lock_address_host_space( (void*) dest ) );
+  T return_val = *dest;
+  *dest = Oper::apply(return_val, val);
+  Impl::unlock_address_host_space( (void*) dest );
+  return return_val;
+#else
+  // This is a way to (hopefully) avoid dead lock in a warp
+  bool done = false;
+  while (! done ) {
+    if( Impl::lock_address_cuda_space( (void*) dest ) ) {
+      T return_val = *dest;
+      *dest = Oper::apply(return_val, val);;
+      Impl::unlock_address_cuda_space( (void*) dest );
+    }
+  }
+  return return_val;
+#endif
+}
+
+template < class Oper, typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
+  typename ::Kokkos::Impl::enable_if<
+                ( sizeof(T) != 4 )
+             && ( sizeof(T) != 8 )
+          #if defined(KOKKOS_ENABLE_ASM) && defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST)
+             && ( sizeof(T) != 16 )
+          #endif
+           , const T >::type& val )
+{
+
+#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
+  while( !Impl::lock_address_host_space( (void*) dest ) );
+  T return_val = Oper::apply(*dest, val);
+  *dest = return_val;
+  Impl::unlock_address_host_space( (void*) dest );
+  return return_val;
+#else
+  // This is a way to (hopefully) avoid dead lock in a warp
+  bool done = false;
+  while (! done ) {
+    if( Impl::lock_address_cuda_space( (void*) dest ) ) {
+      T return_val = Oper::apply(*dest, val);
+      *dest = return_val;
+      Impl::unlock_address_cuda_space( (void*) dest );
+    }
+  }
+  return return_val;
+#endif
+}
+
+}
+}
+
+namespace Kokkos {
+
+// Fetch_Oper atomics: return value before operation
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mul(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::MulOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_div(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::DivOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_mod(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::ModOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_and(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::AndOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_or(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::OrOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_xor(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::XorOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_lshift(volatile T * const dest, const unsigned int val) {
+  return Impl::atomic_fetch_oper(Impl::LShiftOper<T,const unsigned int>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_rshift(volatile T * const dest, const unsigned int val) {
+  return Impl::atomic_fetch_oper(Impl::RShiftOper<T,const unsigned int>(),dest,val);
+}
+
+
+// Oper Fetch atomics: return value after operation
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_mul_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::MulOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_div_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::DivOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_mod_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::ModOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_and_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::AndOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_or_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::OrOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_xor_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::XorOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_lshift_fetch(volatile T * const dest, const unsigned int val) {
+  return Impl::atomic_oper_fetch(Impl::LShiftOper<T,const unsigned int>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_rshift_fetch(volatile T * const dest, const unsigned int val) {
+  return Impl::atomic_oper_fetch(Impl::RShiftOper<T,const unsigned int>(),dest,val);
+}
+
+
+}
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..f95ed67da97e3ada83dac18f8f3fc2dab04c7afb
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp
@@ -0,0 +1,462 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_ATOMIC_VIEW_HPP
+#define KOKKOS_ATOMIC_VIEW_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_Atomic.hpp>
+
+namespace Kokkos { namespace Impl {
+
+class AllocationTracker;
+
+//The following tag is used to prevent an implicit call of the constructor when trying
+//to assign a literal 0 int ( = 0 );
+struct AtomicViewConstTag {};
+
+template<class ViewTraits>
+class AtomicDataElement {
+public:
+  typedef typename ViewTraits::value_type value_type;
+  typedef typename ViewTraits::const_value_type const_value_type;
+  typedef typename ViewTraits::non_const_value_type non_const_value_type;
+  volatile value_type* const ptr;
+
+  KOKKOS_INLINE_FUNCTION
+  AtomicDataElement(value_type* ptr_, AtomicViewConstTag ):ptr(ptr_){}
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator = (const_value_type& val) const {
+    *ptr = val;
+    return val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator = (volatile const_value_type& val) const {
+    *ptr = val;
+    return val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void inc() const {
+    Kokkos::atomic_increment(ptr);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void dec() const {
+    Kokkos::atomic_decrement(ptr);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ++ () const {
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,1);
+    return tmp+1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator -- () const {
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-1);
+    return tmp-1;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ++ (int) const {
+    return Kokkos::atomic_fetch_add(ptr,1);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator -- (int) const {
+    return Kokkos::atomic_fetch_add(ptr,-1);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator += (const_value_type& val) const {
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,val);
+    return tmp+val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator += (volatile const_value_type& val) const {
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,val);
+    return tmp+val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator -= (const_value_type& val) const {
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-val);
+    return tmp-val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator -= (volatile const_value_type& val) const {
+    const_value_type tmp = Kokkos::atomic_fetch_add(ptr,-val);
+    return tmp-val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator *= (const_value_type& val) const {
+    return Kokkos::atomic_mul_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator *= (volatile const_value_type& val) const {
+    return Kokkos::atomic_mul_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator /= (const_value_type& val) const {
+    return Kokkos::atomic_div_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator /= (volatile const_value_type& val) const {
+    return Kokkos::atomic_div_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator %= (const_value_type& val) const {
+    return Kokkos::atomic_mod_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator %= (volatile const_value_type& val) const {
+    return Kokkos::atomic_mod_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator &= (const_value_type& val) const {
+    return Kokkos::atomic_and_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator &= (volatile const_value_type& val) const {
+    return Kokkos::atomic_and_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ^= (const_value_type& val) const {
+    return Kokkos::atomic_xor_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ^= (volatile const_value_type& val) const {
+    return Kokkos::atomic_xor_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator |= (const_value_type& val) const {
+    return Kokkos::atomic_or_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator |= (volatile const_value_type& val) const {
+    return Kokkos::atomic_or_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator <<= (const_value_type& val) const {
+    return Kokkos::atomic_lshift_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator <<= (volatile const_value_type& val) const {
+    return Kokkos::atomic_lshift_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator >>= (const_value_type& val) const {
+    return Kokkos::atomic_rshift_fetch(ptr,val);
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator >>= (volatile const_value_type& val) const {
+    return Kokkos::atomic_rshift_fetch(ptr,val);
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator + (const_value_type& val) const {
+    return *ptr+val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator + (volatile const_value_type& val) const {
+    return *ptr+val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator - (const_value_type& val) const {
+    return *ptr-val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator - (volatile const_value_type& val) const {
+    return *ptr-val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator * (const_value_type& val) const {
+    return *ptr*val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator * (volatile const_value_type& val) const {
+    return *ptr*val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator / (const_value_type& val) const {
+    return *ptr/val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator / (volatile const_value_type& val) const {
+    return *ptr/val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator % (const_value_type& val) const {
+    return *ptr^val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator % (volatile const_value_type& val) const {
+    return *ptr^val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ! () const {
+    return !*ptr;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator && (const_value_type& val) const {
+    return *ptr&&val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator && (volatile const_value_type& val) const {
+    return *ptr&&val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator || (const_value_type& val) const {
+    return *ptr|val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator || (volatile const_value_type& val) const {
+    return *ptr|val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator & (const_value_type& val) const {
+    return *ptr&val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator & (volatile const_value_type& val) const {
+    return *ptr&val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator | (const_value_type& val) const {
+    return *ptr|val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator | (volatile const_value_type& val) const {
+    return *ptr|val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ^ (const_value_type& val) const {
+    return *ptr^val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ^ (volatile const_value_type& val) const {
+    return *ptr^val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator ~ () const {
+    return ~*ptr;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator << (const unsigned int& val) const {
+    return *ptr<<val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator << (volatile const unsigned int& val) const {
+    return *ptr<<val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator >> (const unsigned int& val) const {
+    return *ptr>>val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  const_value_type operator >> (volatile const unsigned int& val) const {
+    return *ptr>>val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator == (const_value_type& val) const {
+    return *ptr == val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator == (volatile const_value_type& val) const {
+    return *ptr == val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator != (const_value_type& val) const {
+    return *ptr != val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator != (volatile const_value_type& val) const {
+    return *ptr != val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator >= (const_value_type& val) const {
+    return *ptr >= val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator >= (volatile const_value_type& val) const {
+    return *ptr >= val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator <= (const_value_type& val) const {
+    return *ptr <= val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator <= (volatile const_value_type& val) const {
+    return *ptr <= val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator < (const_value_type& val) const {
+    return *ptr < val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator < (volatile const_value_type& val) const {
+    return *ptr < val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator > (const_value_type& val) const {
+    return *ptr > val;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator > (volatile const_value_type& val) const {
+    return *ptr > val;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  operator const_value_type () const {
+    //return Kokkos::atomic_load(ptr);
+    return *ptr;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  operator volatile non_const_value_type () volatile const {
+    //return Kokkos::atomic_load(ptr);
+    return *ptr;
+  }
+};
+
+template<class ViewTraits>
+class AtomicViewDataHandle {
+public:
+  typename ViewTraits::value_type* ptr;
+
+  KOKKOS_INLINE_FUNCTION
+  AtomicViewDataHandle()
+    : ptr(NULL)
+  {}
+
+  KOKKOS_INLINE_FUNCTION
+  AtomicViewDataHandle(typename ViewTraits::value_type* ptr_)
+    :ptr(ptr_)
+  {}
+
+  template<class iType>
+  KOKKOS_INLINE_FUNCTION
+  AtomicDataElement<ViewTraits> operator[] (const iType& i) const {
+    return AtomicDataElement<ViewTraits>(ptr+i,AtomicViewConstTag());
+  }
+
+
+  KOKKOS_INLINE_FUNCTION
+  operator typename ViewTraits::value_type * () const { return ptr ; }
+
+};
+
+template<unsigned Size>
+struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars;
+
+template<>
+struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<4> {
+  typedef int type;
+};
+
+template<>
+struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
+  typedef int64_t type;
+};
+
+// Must be non-const, atomic access trait, and 32 or 64 bit type for true atomics.
+template<class ViewTraits>
+class ViewDataHandle<
+  ViewTraits ,
+  typename enable_if<
+    ( ! is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value) &&
+    ( ViewTraits::memory_traits::Atomic )
+  >::type >
+{
+private:
+//  typedef typename if_c<(sizeof(typename ViewTraits::const_value_type)==4) ||
+//                        (sizeof(typename ViewTraits::const_value_type)==8),
+//                         int, Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars >::type
+//                   atomic_view_possible;
+  typedef typename Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<sizeof(typename ViewTraits::const_value_type)>::type enable_atomic_type;
+  typedef ViewDataHandle self_type;
+
+public:
+  enum {  ReturnTypeIsReference = false };
+
+  typedef Impl::AtomicViewDataHandle<ViewTraits> handle_type;
+  typedef Impl::AtomicDataElement<ViewTraits>    return_type;
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & /*arg_tracker*/ )
+  {
+    return handle_type(arg_data_ptr);
+  }
+};
+
+}} // namespace Kokkos::Impl
+
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..62581569fbfebedbcc577c29837233123a8ec8a3
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Windows.hpp
@@ -0,0 +1,211 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+#ifndef KOKKOS_ATOMIC_WINDOWS_HPP
+#define KOKKOS_ATOMIC_WINDOWS_HPP
+#ifdef _WIN32
+
+#define NOMINMAX
+#include <Windows.h>
+
+namespace Kokkos {
+  namespace Impl {
+    _declspec(align(16))
+    struct cas128_t
+    {
+      LONGLONG lower;
+      LONGLONG upper;
+      KOKKOS_INLINE_FUNCTION
+        bool operator != (const cas128_t& a) const {
+        return (lower != a.lower) || upper != a.upper;
+      }
+    };
+  }
+
+#ifdef KOKKOS_HAVE_CXX11
+  template < typename T >
+  KOKKOS_INLINE_FUNCTION
+    T atomic_compare_exchange(volatile T * const dest, const T & compare,
+    typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(LONG), const T & >::type val)
+  {
+    union U {
+      LONG i;
+      T t;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } tmp;
+
+    tmp.i = _InterlockedCompareExchange((LONG*)dest, *((LONG*)&val), *((LONG*)&compare));
+    return tmp.t;
+  }
+
+  template < typename T >
+  KOKKOS_INLINE_FUNCTION
+    T atomic_compare_exchange(volatile T * const dest, const T & compare,
+    typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(LONGLONG), const T & >::type val)
+  {
+    union U {
+      LONGLONG i;
+      T t;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } tmp;
+
+    tmp.i = _InterlockedCompareExchange64((LONGLONG*)dest, *((LONGLONG*)&val), *((LONGLONG*)&compare));
+    return tmp.t;
+  }
+
+  template < typename T >
+  KOKKOS_INLINE_FUNCTION
+    T atomic_compare_exchange(volatile T * const dest, const T & compare,
+    typename Kokkos::Impl::enable_if< sizeof(T) == sizeof(Impl::cas128_t), const T & >::type val)
+  {
+    union U {
+      Impl::cas128_t i;
+      T t;
+      KOKKOS_INLINE_FUNCTION U() {};
+    } tmp, newval;
+    newval.t = val;
+    tmp.i = _InterlockedCompareExchange128((LONGLONG*)dest, newval.i.upper, newval.i.lower, *((LONGLONG*)&compare));
+    return tmp.t;
+  }
+
+  template< typename T >
+  T atomic_fetch_or(volatile T * const dest, const T val) {
+    T oldval = *dest;
+    T assume;
+    do {
+      assume = oldval;
+      T newval = val | oldval;
+      oldval = atomic_compare_exchange(dest, assume, newval);
+    } while (assume != oldval);
+
+    return oldval;
+  }
+
+  template< typename T >
+  T atomic_fetch_and(volatile T * const dest, const T val) {
+    T oldval = *dest;
+    T assume;
+    do {
+      assume = oldval;
+      T newval = val & oldval;
+      oldval = atomic_compare_exchange(dest, assume, newval);
+    } while (assume != oldval);
+
+    return oldval;
+  }
+
+  template< typename T >
+  T atomic_fetch_add(volatile T * const dest, const T val) {
+    T oldval = *dest;
+    T assume;
+    do {
+      assume = oldval;
+      T newval = val + oldval;
+      oldval = atomic_compare_exchange(dest, assume, newval);
+    } while (assume != oldval);
+
+    return oldval;
+  }
+
+  template< typename T >
+  T atomic_fetch_exchange(volatile T * const dest, const T val) {
+    T oldval = *dest;
+    T assume;
+    do {
+      assume = oldval;
+      oldval = atomic_compare_exchange(dest, assume, val);
+    } while (assume != oldval);
+
+    return oldval;
+  }
+
+  template< typename T >
+  void atomic_or(volatile T * const dest, const T val) {
+    atomic_fetch_or(dest, val);
+  }
+
+  template< typename T >
+  void atomic_and(volatile T * const dest, const T val) {
+    atomic_fetch_and(dest, val);
+  }
+
+  template< typename T >
+  void atomic_add(volatile T * const dest, const T val) {
+    atomic_fetch_add(dest, val);
+  }
+
+  template< typename T >
+  void atomic_exchange(volatile T * const dest, const T val) {
+    atomic_fetch_exchange(dest, val);
+  }
+
+  template< typename T >
+  void atomic_assign(volatile T * const dest, const T val) {
+    atomic_fetch_exchange(dest, val);
+  }
+
+  template< typename T >
+  T atomic_increment(volatile T * const dest) {
+    T oldval = *dest;
+    T assume;
+    do {
+      assume = oldval;
+      T newval = assume++;
+      oldval = atomic_compare_exchange(dest, assume, newval);
+    } while (assume != oldval);
+  }
+
+  template< typename T >
+  T atomic_decrement(volatile T * const dest) {
+    T oldval = *dest;
+    T assume;
+    do {
+      assume = oldval;
+      T newval = assume--;
+      oldval = atomic_compare_exchange(dest, assume, newval);
+    } while (assume != oldval);
+  }
+
+}
+#endif
+#endif
+#endif
\ No newline at end of file
diff --git a/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.cpp b/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..8da619fdba9b58bf16f6f23bd1a148bdd224a28d
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.cpp
@@ -0,0 +1,281 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_HostSpace.hpp>
+
+#include <impl/Kokkos_BasicAllocators.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+
+#include <stdint.h>    // uintptr_t
+#include <cstdlib>     // for malloc, realloc, and free
+#include <cstring>     // for memcpy
+#include <sys/mman.h>  // for mmap, munmap, MAP_ANON, etc
+#include <unistd.h>    // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES
+
+#include <sstream>
+
+namespace Kokkos { namespace Impl {
+
+/*--------------------------------------------------------------------------*/
+
+void* MallocAllocator::allocate( size_t size )
+{
+  void * ptr = NULL;
+  if (size) {
+    ptr = malloc(size);
+
+    if (!ptr)
+    {
+      std::ostringstream msg ;
+      msg << name() << ": allocate(" << size << ") FAILED";
+      throw_runtime_exception( msg.str() );
+    }
+  }
+  return ptr;
+}
+
+void MallocAllocator::deallocate( void * ptr, size_t /*size*/ )
+{
+  if (ptr) {
+    free(ptr);
+  }
+}
+
+void * MallocAllocator::reallocate(void * old_ptr, size_t /*old_size*/, size_t new_size)
+{
+  void * ptr = realloc(old_ptr, new_size);
+
+  if (new_size > 0u && ptr == NULL) {
+    throw_runtime_exception("Error: Malloc Allocator could not reallocate memory");
+  }
+  return ptr;
+}
+
+/*--------------------------------------------------------------------------*/
+
+namespace {
+
+void * raw_aligned_allocate( size_t size, size_t alignment )
+{
+  void * ptr = NULL;
+  if ( size ) {
+#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
+    ptr = _mm_malloc( size , alignment );
+
+#elif ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
+    ( defined( _XOPEN_SOURCE )   && _XOPEN_SOURCE   >= 600 )
+
+    posix_memalign( & ptr, alignment , size );
+
+#else
+    // Over-allocate to and round up to guarantee proper alignment.
+    size_t size_padded = size + alignment + sizeof(void *);
+    void * alloc_ptr = malloc( size_padded );
+
+    if (alloc_ptr) {
+      uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr);
+      // offset enough to record the alloc_ptr
+      address += sizeof(void *);
+      uintptr_t rem = address % alignment;
+      uintptr_t offset = rem ? (alignment - rem) : 0u;
+      address += offset;
+      ptr = reinterpret_cast<void *>(address);
+      // record the alloc'd pointer
+      address -= sizeof(void *);
+      *reinterpret_cast<void **>(address) = alloc_ptr;
+    }
+#endif
+  }
+  return ptr;
+}
+
+void raw_aligned_deallocate( void * ptr, size_t /*size*/ )
+{
+  if ( ptr ) {
+#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
+    _mm_free( ptr );
+
+#elif ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
+      ( defined( _XOPEN_SOURCE )   && _XOPEN_SOURCE   >= 600 )
+    free( ptr );
+#else
+    // get the alloc'd pointer
+    void * alloc_ptr = *(reinterpret_cast<void **>(ptr) -1);
+    free( alloc_ptr );
+#endif
+  }
+
+}
+
+}
+
+void* AlignedAllocator::allocate( size_t size )
+{
+  void * ptr = 0 ;
+
+  if ( size ) {
+    ptr = raw_aligned_allocate(size, MEMORY_ALIGNMENT);
+
+    if (!ptr)
+    {
+      std::ostringstream msg ;
+      msg << name() << ": allocate(" << size << ") FAILED";
+      throw_runtime_exception( msg.str() );
+    }
+  }
+  return ptr;
+}
+
+void AlignedAllocator::deallocate( void * ptr, size_t size )
+{
+  raw_aligned_deallocate( ptr, size);
+}
+
+void * AlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
+{
+  void * ptr = old_ptr;;
+
+  if (old_size < new_size) {
+    ptr = allocate( new_size );
+
+    memcpy(ptr, old_ptr, old_size );
+
+    deallocate( old_ptr, old_size );
+  }
+
+  return ptr;
+}
+
+/*--------------------------------------------------------------------------*/
+
+// mmap flags for private anonymous memory allocation
+#if defined( MAP_ANONYMOUS ) && defined( MAP_PRIVATE )
+  #define MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS)
+#elif defined( MAP_ANON) && defined( MAP_PRIVATE )
+  #define MMAP_FLAGS (MAP_PRIVATE | MAP_ANON)
+#else
+  #define NO_MMAP
+#endif
+
+// huge page tables
+#if !defined( NO_MMAP )
+  #if defined( MAP_HUGETLB )
+    #define MMAP_FLAGS_HUGE (MMAP_FLAGS | MAP_HUGETLB )
+  #elif defined( MMAP_FLAGS )
+    #define MMAP_FLAGS_HUGE MMAP_FLAGS
+  #endif
+  // threshold to use huge pages
+  #define MMAP_USE_HUGE_PAGES (1u << 27)
+#endif
+
+// read write access to private memory
+#if !defined( NO_MMAP )
+  #define MMAP_PROTECTION (PROT_READ | PROT_WRITE)
+#endif
+
+
+void* PageAlignedAllocator::allocate( size_t size )
+{
+  void *ptr = NULL;
+  if (size) {
+#if !defined NO_MMAP
+    if ( size < MMAP_USE_HUGE_PAGES ) {
+      ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS, -1 /*file descriptor*/, 0 /*offset*/);
+    } else {
+      ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS_HUGE, -1 /*file descriptor*/, 0 /*offset*/);
+    }
+    if (ptr == MAP_FAILED) {
+      ptr = NULL;
+    }
+#else
+    static const size_t page_size = 4096; // TODO: read in from sysconf( _SC_PAGE_SIZE )
+
+    ptr = raw_aligned_allocate( size, page_size);
+#endif
+    if (!ptr)
+    {
+      std::ostringstream msg ;
+      msg << name() << ": allocate(" << size << ") FAILED";
+      throw_runtime_exception( msg.str() );
+    }
+  }
+  return ptr;
+}
+
+void PageAlignedAllocator::deallocate( void * ptr, size_t size )
+{
+#if !defined( NO_MMAP )
+  munmap(ptr, size);
+#else
+  raw_aligned_deallocate(ptr, size);
+#endif
+}
+
+void * PageAlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
+{
+  void * ptr = NULL;
+#if defined( NO_MMAP ) || defined( __APPLE__ )
+
+  if (old_size != new_size) {
+    ptr = allocate( new_size );
+
+    memcpy(ptr, old_ptr, (old_size < new_size ? old_size : new_size) );
+
+    deallocate( old_ptr, old_size );
+  }
+  else {
+    ptr = old_ptr;
+  }
+#else
+  ptr = mremap( old_ptr, old_size, new_size, MREMAP_MAYMOVE );
+
+  if (ptr == MAP_FAILED) {
+    throw_runtime_exception("Error: Page Aligned Allocator could not reallocate memory");
+  }
+#endif
+
+  return ptr;
+}
+
+}} // namespace Kokkos::Impl
diff --git a/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.hpp b/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..76377c5f159abe88272a2a73794bf899a4427aee
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_BasicAllocators.hpp
@@ -0,0 +1,118 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_BASIC_ALLOCATORS_HPP
+#define KOKKOS_BASIC_ALLOCATORS_HPP
+
+
+namespace Kokkos { namespace Impl {
+
+/// class UnmanagedAllocator
+/// does nothing when deallocate(ptr,size) is called
+class UnmanagedAllocator
+{
+public:
+  static const char * name() { return "Unmanaged Allocator"; }
+
+  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
+};
+
+
+/// class MallocAllocator
+class MallocAllocator
+{
+public:
+  static const char * name()
+  {
+    return "Malloc Allocator";
+  }
+
+  static void* allocate(size_t size);
+
+  static void deallocate(void * ptr, size_t size);
+
+  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+};
+
+
+/// class AlignedAllocator
+/// memory aligned to Kokkos::Impl::MEMORY_ALIGNMENT
+class AlignedAllocator
+{
+public:
+  static const char * name()
+  {
+    return "Aligned Allocator";
+  }
+
+  static void* allocate(size_t size);
+
+  static void deallocate(void * ptr, size_t size);
+
+  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+};
+
+
+/// class PageAlignedAllocator
+/// memory aligned to PAGE_SIZE
+class PageAlignedAllocator
+{
+public:
+  static const char * name()
+  {
+    return "Page Aligned Allocator";
+  }
+
+  static void* allocate(size_t size);
+
+  static void deallocate(void * ptr, size_t size);
+
+  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
+};
+
+
+}} // namespace Kokkos::Impl
+
+#endif //KOKKOS_BASIC_ALLOCATORS_HPP
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..1c3c83cfe7c12c95889cee98c9be2c2bbc896f38
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp
@@ -0,0 +1,447 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <cctype>
+#include <cstring>
+#include <iostream>
+#include <cstdlib>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+namespace {
+
+bool is_unsigned_int(const char* str)
+{
+  const size_t len = strlen (str);
+  for (size_t i = 0; i < len; ++i) {
+    if (! isdigit (str[i])) {
+      return false;
+    }
+  }
+  return true;
+}
+
+void initialize_internal(const InitArguments& args)
+{
+  // Protect declarations, to prevent "unused variable" warnings.
+#if defined( KOKKOS_HAVE_OPENMP ) || defined( KOKKOS_HAVE_PTHREAD )
+  const int num_threads = args.num_threads;
+  const int use_numa = args.num_numa;
+#endif // defined( KOKKOS_HAVE_OPENMP ) || defined( KOKKOS_HAVE_PTHREAD )
+#if defined( KOKKOS_HAVE_CUDA )
+  const int use_gpu = args.device_id;
+#endif // defined( KOKKOS_HAVE_CUDA )
+
+#if defined( KOKKOS_HAVE_OPENMP )
+  if( Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
+      Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {
+    if(num_threads>0) {
+      if(use_numa>0) {
+        Kokkos::OpenMP::initialize(num_threads,use_numa);
+      }
+      else {
+        Kokkos::OpenMP::initialize(num_threads);
+      }
+    } else {
+      Kokkos::OpenMP::initialize();
+    }
+    //std::cout << "Kokkos::initialize() fyi: OpenMP enabled and initialized" << std::endl ;
+  }
+  else {
+    //std::cout << "Kokkos::initialize() fyi: OpenMP enabled but not initialized" << std::endl ;
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+  if( Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
+      Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) {
+    if(num_threads>0) {
+      if(use_numa>0) {
+        Kokkos::Threads::initialize(num_threads,use_numa);
+      }
+      else {
+        Kokkos::Threads::initialize(num_threads);
+      }
+    } else {
+      Kokkos::Threads::initialize();
+    }
+    //std::cout << "Kokkos::initialize() fyi: Pthread enabled and initialized" << std::endl ;
+  }
+  else {
+    //std::cout << "Kokkos::initialize() fyi: Pthread enabled but not initialized" << std::endl ;
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_SERIAL )
+  // Prevent "unused variable" warning for 'args' input struct.  If
+  // Serial::initialize() ever needs to take arguments from the input
+  // struct, you may remove this line of code.
+  (void) args;
+
+  if( Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value ||
+      Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ) {
+    Kokkos::Serial::initialize();
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_CUDA )
+  if( Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value || 0 < use_gpu ) {
+    if (use_gpu > -1) {
+      Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice( use_gpu ) );
+    }
+    else {
+      Kokkos::Cuda::initialize();
+    }
+    //std::cout << "Kokkos::initialize() fyi: Cuda enabled and initialized" << std::endl ;
+  }
+#endif
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+    Kokkos::Experimental::initialize();
+#endif
+}
+
+void finalize_internal( const bool all_spaces = false )
+{
+
+#if defined( KOKKOS_HAVE_CUDA )
+  if( Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value || all_spaces ) {
+    if(Kokkos::Cuda::is_initialized())
+      Kokkos::Cuda::finalize();
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_OPENMP )
+  if( Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
+      Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ||
+      all_spaces ) {
+    if(Kokkos::OpenMP::is_initialized())
+      Kokkos::OpenMP::finalize();
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+  if( Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
+      Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ||
+      all_spaces ) {
+    if(Kokkos::Threads::is_initialized())
+      Kokkos::Threads::finalize();
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_SERIAL )
+  if( Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value ||
+      Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ||
+      all_spaces ) {
+    if(Kokkos::Serial::is_initialized())
+      Kokkos::Serial::finalize();
+  }
+#endif
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+    Kokkos::Experimental::finalize();
+#endif
+
+}
+
+void fence_internal()
+{
+
+#if defined( KOKKOS_HAVE_CUDA )
+  if( Impl::is_same< Kokkos::Cuda , Kokkos::DefaultExecutionSpace >::value ) {
+    Kokkos::Cuda::fence();
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_OPENMP )
+  if( Impl::is_same< Kokkos::OpenMP , Kokkos::DefaultExecutionSpace >::value ||
+      Impl::is_same< Kokkos::OpenMP , Kokkos::HostSpace::execution_space >::value ) {
+    Kokkos::OpenMP::fence();
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+  if( Impl::is_same< Kokkos::Threads , Kokkos::DefaultExecutionSpace >::value ||
+      Impl::is_same< Kokkos::Threads , Kokkos::HostSpace::execution_space >::value ) {
+    Kokkos::Threads::fence();
+  }
+#endif
+
+#if defined( KOKKOS_HAVE_SERIAL )
+  if( Impl::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value ||
+      Impl::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ) {
+    Kokkos::Serial::fence();
+  }
+#endif
+
+}
+
+} // namespace
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+void initialize(int& narg, char* arg[])
+{
+    int num_threads = -1;
+    int numa = -1;
+    int device = -1;
+
+    int kokkos_threads_found = 0;
+    int kokkos_numa_found = 0;
+    int kokkos_device_found = 0;
+    int kokkos_ndevices_found = 0;
+
+    int iarg = 0;
+
+    while (iarg < narg) {
+      if ((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || (strncmp(arg[iarg],"--threads",9) == 0)) {
+        //Find the number of threads (expecting --threads=XX)
+        if (!((strncmp(arg[iarg],"--kokkos-threads=",17) == 0) || (strncmp(arg[iarg],"--threads=",10) == 0)))
+          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+        char* number =  strchr(arg[iarg],'=')+1;
+
+        if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
+          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--threads/--kokkos-threads'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+        if((strncmp(arg[iarg],"--kokkos-threads",16) == 0) || !kokkos_threads_found)
+          num_threads = atoi(number);
+
+        //Remove the --kokkos-threads argument from the list but leave --threads
+        if(strncmp(arg[iarg],"--kokkos-threads",16) == 0) {
+          for(int k=iarg;k<narg-1;k++) {
+            arg[k] = arg[k+1];
+          }
+          kokkos_threads_found=1;
+          narg--;
+        } else {
+          iarg++;
+        }
+      } else if ((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || (strncmp(arg[iarg],"--numa",6) == 0)) {
+        //Find the number of numa (expecting --numa=XX)
+        if (!((strncmp(arg[iarg],"--kokkos-numa=",14) == 0) || (strncmp(arg[iarg],"--numa=",7) == 0)))
+          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+        char* number =  strchr(arg[iarg],'=')+1;
+
+        if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
+          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--numa/--kokkos-numa'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+        if((strncmp(arg[iarg],"--kokkos-numa",13) == 0) || !kokkos_numa_found)
+          numa = atoi(number);
+
+        //Remove the --kokkos-numa argument from the list but leave --numa
+        if(strncmp(arg[iarg],"--kokkos-numa",13) == 0) {
+          for(int k=iarg;k<narg-1;k++) {
+            arg[k] = arg[k+1];
+          }
+          kokkos_numa_found=1;
+          narg--;
+        } else {
+          iarg++;
+        }
+      } else if ((strncmp(arg[iarg],"--kokkos-device",15) == 0) || (strncmp(arg[iarg],"--device",8) == 0)) {
+        //Find the number of device (expecting --device=XX)
+        if (!((strncmp(arg[iarg],"--kokkos-device=",16) == 0) || (strncmp(arg[iarg],"--device=",9) == 0)))
+          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+        char* number =  strchr(arg[iarg],'=')+1;
+
+        if(!Impl::is_unsigned_int(number) || (strlen(number)==0))
+          Impl::throw_runtime_exception("Error: expecting an '=INT' after command line argument '--device/--kokkos-device'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+        if((strncmp(arg[iarg],"--kokkos-device",15) == 0) || !kokkos_device_found)
+          device = atoi(number);
+
+        //Remove the --kokkos-device argument from the list but leave --device
+        if(strncmp(arg[iarg],"--kokkos-device",15) == 0) {
+          for(int k=iarg;k<narg-1;k++) {
+            arg[k] = arg[k+1];
+          }
+          kokkos_device_found=1;
+          narg--;
+        } else {
+          iarg++;
+        }
+      } else if ((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || (strncmp(arg[iarg],"--ndevices",10) == 0)) {
+
+        //Find the number of device (expecting --device=XX)
+        if (!((strncmp(arg[iarg],"--kokkos-ndevices=",18) == 0) || (strncmp(arg[iarg],"--ndevices=",11) == 0)))
+          Impl::throw_runtime_exception("Error: expecting an '=INT[,INT]' after command line argument '--ndevices/--kokkos-ndevices'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+        int ndevices=-1;
+        int skip_device = 9999;
+
+        char* num1 = strchr(arg[iarg],'=')+1;
+        char* num2 = strpbrk(num1,",");
+        int num1_len = num2==NULL?strlen(num1):num2-num1;
+        char* num1_only = new char[num1_len+1];
+        strncpy(num1_only,num1,num1_len);
+        num1_only[num1_len]=0;
+
+        if(!Impl::is_unsigned_int(num1_only) || (strlen(num1_only)==0)) {
+          Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--kokkos-ndevices'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+        }
+        if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found)
+          ndevices = atoi(num1_only);
+
+        if( num2 != NULL ) {
+          if(( !Impl::is_unsigned_int(num2+1) ) || (strlen(num2)==1) )
+            Impl::throw_runtime_exception("Error: expecting an integer number after command line argument '--kokkos-ndevices=XX,'. Raised by Kokkos::initialize(int narg, char* argc[]).");
+
+          if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found)
+            skip_device = atoi(num2+1);
+        }
+
+        if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found) {
+          char *str;
+          if ((str = getenv("SLURM_LOCALID"))) {
+            int local_rank = atoi(str);
+            device = local_rank % ndevices;
+            if (device >= skip_device) device++;
+          }
+          if ((str = getenv("MV2_COMM_WORLD_LOCAL_RANK"))) {
+            int local_rank = atoi(str);
+            device = local_rank % ndevices;
+            if (device >= skip_device) device++;
+          }
+          if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK"))) {
+            int local_rank = atoi(str);
+            device = local_rank % ndevices;
+            if (device >= skip_device) device++;
+          }
+          if(device==-1) {
+            device = 0;
+            if (device >= skip_device) device++;
+          }
+        }
+
+        //Remove the --kokkos-ndevices argument from the list but leave --ndevices
+        if(strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) {
+          for(int k=iarg;k<narg-1;k++) {
+            arg[k] = arg[k+1];
+          }
+          kokkos_ndevices_found=1;
+          narg--;
+        } else {
+          iarg++;
+        }
+      } else if ((strcmp(arg[iarg],"--kokkos-help") == 0) || (strcmp(arg[iarg],"--help") == 0)) {
+         std::cout << std::endl;
+         std::cout << "--------------------------------------------------------------------------------" << std::endl;
+         std::cout << "-------------Kokkos command line arguments--------------------------------------" << std::endl;
+         std::cout << "--------------------------------------------------------------------------------" << std::endl;
+         std::cout << "The following arguments exist also without prefix 'kokkos' (e.g. --help)." << std::endl;
+         std::cout << "The prefixed arguments will be removed from the list by Kokkos::initialize()," << std::endl;
+         std::cout << "the non-prefixed ones are not removed. Prefixed versions take precedence over " << std::endl;
+         std::cout << "non prefixed ones, and the last occurence of an argument overwrites prior" << std::endl;
+         std::cout << "settings." << std::endl;
+         std::cout << std::endl;
+         std::cout << "--kokkos-help               : print this message" << std::endl;
+         std::cout << "--kokkos-threads=INT        : specify total number of threads or" << std::endl;
+         std::cout << "                              number of threads per NUMA region if " << std::endl;
+         std::cout << "                              used in conjunction with '--numa' option. " << std::endl;
+         std::cout << "--kokkos-numa=INT           : specify number of NUMA regions used by process." << std::endl;
+         std::cout << "--kokkos-device=INT         : specify device id to be used by Kokkos. " << std::endl;
+         std::cout << "--kokkos-ndevices=INT[,INT] : used when running MPI jobs. Specify number of" << std::endl;
+         std::cout << "                              devices per node to be used. Process to device" << std::endl;
+         std::cout << "                              mapping happens by obtaining the local MPI rank" << std::endl;
+         std::cout << "                              and assigning devices round-robin. The optional" << std::endl;
+         std::cout << "                              second argument allows for an existing device" << std::endl;
+         std::cout << "                              to be ignored. This is most useful on workstations" << std::endl;
+         std::cout << "                              with multiple GPUs of which one is used to drive" << std::endl;
+         std::cout << "                              screen output." << std::endl;
+         std::cout << std::endl;
+         std::cout << "--------------------------------------------------------------------------------" << std::endl;
+         std::cout << std::endl;
+
+         //Remove the --kokkos-help argument from the list but leave --ndevices
+         if(strcmp(arg[iarg],"--kokkos-help") == 0) {
+           for(int k=iarg;k<narg-1;k++) {
+             arg[k] = arg[k+1];
+           }
+           narg--;
+         } else {
+           iarg++;
+         }
+      } else
+      iarg++;
+    }
+
+    InitArguments arguments;
+    arguments.num_threads = num_threads;
+    arguments.num_numa = numa;
+    arguments.device_id = device;
+    Impl::initialize_internal(arguments);
+}
+
+void initialize(const InitArguments& arguments) {
+  Impl::initialize_internal(arguments);
+}
+
+void finalize()
+{
+  Impl::finalize_internal();
+}
+
+void finalize_all()
+{
+  enum { all_spaces = true };
+  Impl::finalize_internal( all_spaces );
+}
+
+void fence()
+{
+  Impl::fence_internal();
+}
+
+} // namespace Kokkos
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Error.cpp b/lib/kokkos/core/src/impl/Kokkos_Error.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..97cfbfae7e82422f6795fd0228ccb993580afb89
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Error.cpp
@@ -0,0 +1,193 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include <ostream>
+#include <sstream>
+#include <iomanip>
+#include <stdexcept>
+#include <impl/Kokkos_Error.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+void host_abort( const char * const message )
+{
+  fwrite(message,1,strlen(message),stderr);
+  fflush(stderr);
+  abort();
+}
+
+void throw_runtime_exception( const std::string & msg )
+{
+  std::ostringstream o ;
+  o << msg ;
+  traceback_callstack( o );
+  throw std::runtime_error( o.str() );
+}
+
+
+std::string human_memory_size(size_t arg_bytes)
+{
+  double bytes = arg_bytes;
+  const double K = 1024;
+  const double M = K*1024;
+  const double G = M*1024;
+
+  std::ostringstream out;
+  if (bytes < K) {
+    out << std::setprecision(4) << bytes << " B";
+  } else if (bytes < M) {
+    bytes /= K;
+    out << std::setprecision(4) << bytes << " K";
+  } else if (bytes < G) {
+    bytes /= M;
+    out << std::setprecision(4) << bytes << " M";
+  } else {
+    bytes /= G;
+    out << std::setprecision(4) << bytes << " G";
+  }
+  return out.str();
+}
+
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( __GNUC__ ) && defined( ENABLE_TRACEBACK )
+
+/*  This is only known to work with GNU C++
+ *  Must be compiled with '-rdynamic'
+ *  Must be linked with   '-ldl'
+ */
+
+/* Print call stack into an error stream,
+ * so one knows in which function the error occured.
+ *
+ * Code copied from:
+ *   http://stupefydeveloper.blogspot.com/2008/10/cc-call-stack.html
+ *
+ * License on this site:
+ *   This blog is licensed under a
+ *   Creative Commons Attribution-Share Alike 3.0 Unported License.
+ *
+ *   http://creativecommons.org/licenses/by-sa/3.0/
+ *
+ * Modified to output to std::ostream.
+ */
+#include <signal.h>
+#include <execinfo.h>
+#include <cxxabi.h>
+#include <dlfcn.h>
+#include <stdlib.h>
+
+namespace Kokkos {
+namespace Impl {
+
+void traceback_callstack( std::ostream & msg )
+{
+  using namespace abi;
+
+  enum { MAX_DEPTH = 32 };
+
+  void *trace[MAX_DEPTH];
+  Dl_info dlinfo;
+
+  int status;
+
+  int trace_size = backtrace(trace, MAX_DEPTH);
+
+  msg << std::endl << "Call stack {" << std::endl ;
+
+  for (int i=1; i<trace_size; ++i)
+  {
+    if(!dladdr(trace[i], &dlinfo))
+        continue;
+
+    const char * symname = dlinfo.dli_sname;
+
+    char * demangled = __cxa_demangle(symname, NULL, 0, &status);
+
+    if ( status == 0 && demangled ) {
+      symname = demangled;
+    }
+
+    if ( symname && *symname != 0 ) {
+      msg << "  object: " << dlinfo.dli_fname
+          << " function: " << symname
+          << std::endl ;
+    }
+
+    if ( demangled ) {
+        free(demangled);
+    }
+  }
+  msg << "}" ;
+}
+
+}
+}
+
+#else
+
+namespace Kokkos {
+namespace Impl {
+
+void traceback_callstack( std::ostream & msg )
+{
+  msg << std::endl << "Traceback functionality not available" << std::endl ;
+}
+
+}
+}
+
+#endif
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Error.hpp b/lib/kokkos/core/src/impl/Kokkos_Error.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..33e203c948b23cc511205f529d6114d88f31307e
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Error.hpp
@@ -0,0 +1,78 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPL_ERROR_HPP
+#define KOKKOS_IMPL_ERROR_HPP
+
+#include <string>
+#include <iosfwd>
+
+namespace Kokkos {
+namespace Impl {
+
+void host_abort( const char * const );
+
+void throw_runtime_exception( const std::string & );
+
+void traceback_callstack( std::ostream & );
+
+std::string human_memory_size(size_t arg_bytes);
+
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+namespace Kokkos {
+inline
+void abort( const char * const message ) { Kokkos::Impl::host_abort(message); }
+}
+#endif /* defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_IMPL_ERROR_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..ff6230b57c8abbd778059e55aa2a019d6bee70e2
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAdapter.hpp
@@ -0,0 +1,1070 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_FUNCTORADAPTER_HPP
+#define KOKKOS_FUNCTORADAPTER_HPP
+
+#include <cstddef>
+#include <Kokkos_Core_fwd.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Tags.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ArgTag , class Enable = void >
+struct FunctorDeclaresValueType : public Impl::false_type {};
+
+template< class FunctorType , class ArgTag >
+struct FunctorDeclaresValueType< FunctorType , ArgTag
+                               , typename Impl::enable_if_type< typename FunctorType::value_type >::type >
+  : public Impl::true_type {};
+
+
+/** \brief  Query Functor and execution policy argument tag for value type.
+ *
+ *  If C++11 enabled and 'value_type' is not explicitly declared then attempt
+ *  to deduce the type from FunctorType::operator().
+ */
+template< class FunctorType , class ArgTag , bool Dec = FunctorDeclaresValueType<FunctorType,ArgTag>::value >
+struct FunctorValueTraits
+{
+  typedef void value_type ;
+  typedef void pointer_type ;
+  typedef void reference_type ;
+
+  enum { StaticValueSize = 0 };
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_count( const FunctorType & ) { return 0 ; }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_size( const FunctorType & ) { return 0 ; }
+};
+
+template<class ArgTag>
+struct FunctorValueTraits<void, ArgTag,false>
+{
+  typedef void reference_type;
+};
+
+/** \brief  FunctorType::value_type is explicitly declared so use it.
+ *
+ * Two options for declaration
+ *
+ *   1) A plain-old-data (POD) type
+ *        typedef {pod_type} value_type ;
+ *
+ *   2) An array of POD of a runtime specified count.
+ *        typedef {pod_type} value_type[] ;
+ *        const unsigned     value_count ;
+ */
+template< class FunctorType , class ArgTag >
+struct FunctorValueTraits< FunctorType , ArgTag , true /* == exists FunctorType::value_type */ >
+{
+  typedef typename Impl::remove_extent< typename FunctorType::value_type >::type  value_type ;
+
+  // If not an array then what is the sizeof(value_type)
+  enum { StaticValueSize = Impl::is_array< typename FunctorType::value_type >::value ? 0 : sizeof(value_type) };
+
+  typedef value_type                 * pointer_type ;
+
+  // The reference_type for an array is 'value_type *'
+  // The reference_type for a single value is 'value_type &'
+
+  typedef typename Impl::if_c< ! StaticValueSize , value_type *
+                                                 , value_type & >::type  reference_type ;
+
+  // Number of values if single value
+  template< class F >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  typename Impl::enable_if< Impl::is_same<F,FunctorType>::value && StaticValueSize , unsigned >::type
+    value_count( const F & ) { return 1 ; }
+
+  // Number of values if an array, protect via templating because 'f.value_count'
+  // will only exist when the functor declares the value_type to be an array.
+  template< class F >
+  KOKKOS_FORCEINLINE_FUNCTION static
+  typename Impl::enable_if< Impl::is_same<F,FunctorType>::value && ! StaticValueSize , unsigned >::type
+    value_count( const F & f ) { return f.value_count ; }
+
+  // Total size of the value
+  KOKKOS_INLINE_FUNCTION static
+  unsigned value_size( const FunctorType & f ) { return value_count( f ) * sizeof(value_type) ; }
+};
+
+
+#if defined( KOKKOS_HAVE_CXX11 )
+
+template< class FunctorType , class ArgTag >
+struct FunctorValueTraits< FunctorType
+                         , ArgTag
+                         , false  /* == exists FunctorType::value_type */
+                         >
+{
+private:
+
+  struct VOIDTAG {};   // Allow declaration of non-matching operator() with void argument tag.
+  struct REJECTTAG {}; // Reject tagged operator() when using non-tagged execution policy.
+
+  typedef typename
+    Impl::if_c< Impl::is_same< ArgTag , void >::value , VOIDTAG , ArgTag >::type tag_type ;
+
+  //----------------------------------------
+  // parallel_for operator without a tag:
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember ) const ) {}
+
+  template< class TagType , class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & ) const ) {}
+
+  //----------------------------------------
+  // parallel_for operator with a tag:
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & ) const ) {}
+
+  template< class ArgMember >
+  KOKKOS_INLINE_FUNCTION
+  static VOIDTAG deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & ) const ) {}
+
+  //----------------------------------------
+  // parallel_reduce operator without a tag:
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , T & ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , T & ) const ) {}
+
+  //----------------------------------------
+  // parallel_reduce operator with a tag:
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember & , T & ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember & , T & ) const ) {}
+
+  //----------------------------------------
+  // parallel_scan operator without a tag:
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( ArgMember , T & , bool ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const ArgMember & , T & , bool ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , ArgMember , T & , bool ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( TagType , const ArgMember & , T & , bool ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , ArgMember , T & , bool ) const ) {}
+
+  template< class TagType , class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static REJECTTAG deduce_reduce_type( VOIDTAG , void (FunctorType::*)( const TagType & , const ArgMember & , T & , bool ) const ) {}
+
+  //----------------------------------------
+  // parallel_scan operator with a tag:
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , ArgMember , T & , bool ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , ArgMember , T & , bool ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( tag_type , const ArgMember& , T & , bool ) const ) {}
+
+  template< class ArgMember , class T >
+  KOKKOS_INLINE_FUNCTION
+  static T deduce_reduce_type( tag_type , void (FunctorType::*)( const tag_type & , const ArgMember& , T & , bool ) const ) {}
+
+  //----------------------------------------
+
+  typedef decltype( deduce_reduce_type( tag_type() , & FunctorType::operator() ) ) ValueType ;
+
+  enum { IS_VOID   = Impl::is_same<VOIDTAG  ,ValueType>::value };
+  enum { IS_REJECT = Impl::is_same<REJECTTAG,ValueType>::value };
+
+public:
+
+  typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType   >::type  value_type ;
+  typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType * >::type  pointer_type ;
+  typedef typename Impl::if_c< IS_VOID || IS_REJECT , void , ValueType & >::type  reference_type ;
+
+  enum { StaticValueSize = IS_VOID || IS_REJECT ? 0 : sizeof(ValueType) };
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_size( const FunctorType & ) { return StaticValueSize ; }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  unsigned value_count( const FunctorType & ) { return IS_VOID || IS_REJECT ? 0 : 1 ; }
+};
+
+#endif /* #if defined( KOKKOS_HAVE_CXX11 ) */
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Function signatures for FunctorType::init function with a tag and not an array
+template< class FunctorType , class ArgTag , bool IsArray = 0 == FunctorValueTraits<FunctorType,ArgTag>::StaticValueSize >
+struct FunctorValueInitFunction {
+
+  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type & ) );
+
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile & ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile & ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type volatile & ) );
+};
+
+// Function signatures for FunctorType::init function with a tag and is an array
+template< class FunctorType , class ArgTag >
+struct FunctorValueInitFunction< FunctorType , ArgTag , true > {
+
+  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type * ) );
+
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile * ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile * ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type volatile * ) );
+};
+
+// Function signatures for FunctorType::init function without a tag and not an array
+template< class FunctorType >
+struct FunctorValueInitFunction< FunctorType , void , false > {
+
+  typedef typename FunctorValueTraits<FunctorType,void>::reference_type value_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( value_type & ) );
+
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type volatile & ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( value_type volatile & ) );
+};
+
+// Function signatures for FunctorType::init function without a tag and is an array
+template< class FunctorType >
+struct FunctorValueInitFunction< FunctorType , void , true > {
+
+  typedef typename FunctorValueTraits<FunctorType,void>::reference_type value_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( value_type * ) );
+
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type volatile * ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( value_type volatile * ) );
+};
+
+// Adapter for value initialization function.
+// If a proper FunctorType::init is declared then use it,
+// otherwise use default constructor.
+template< class FunctorType , class ArgTag
+        , class T = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type
+        , class Enable = void >
+struct FunctorValueInit ;
+
+/* No 'init' function provided for single value */
+template< class FunctorType , class ArgTag , class T , class Enable >
+struct FunctorValueInit< FunctorType , ArgTag , T & , Enable >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T & init( const FunctorType & f , void * p )
+    { return *( new(p) T() ); };
+};
+
+/* No 'init' function provided for array value */
+template< class FunctorType , class ArgTag , class T , class Enable >
+struct FunctorValueInit< FunctorType , ArgTag , T * , Enable >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * init( const FunctorType & f , void * p )
+    {
+      const int n = FunctorValueTraits< FunctorType , ArgTag >::value_count(f);
+      for ( int i = 0 ; i < n ; ++i ) { new( ((T*)p) + i ) T(); }
+      return (T*)p ;
+    }
+};
+
+/* 'init' function provided for single value */
+template< class FunctorType , class T >
+struct FunctorValueInit
+  < FunctorType
+  , void
+  , T &
+    // First  substitution failure when FunctorType::init does not exist.
+#if defined( KOKKOS_HAVE_CXX11 )
+    // Second substitution failure when FunctorType::init is not compatible.
+  , decltype( FunctorValueInitFunction< FunctorType , void >::enable_if( & FunctorType::init ) )
+#else
+  , typename Impl::enable_if< 0 < sizeof( & FunctorType::init ) >::type
+#endif
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T & init( const FunctorType & f , void * p )
+    { f.init( *((T*)p) ); return *((T*)p) ; }
+};
+
+/* 'init' function provided for array value */
+template< class FunctorType , class T >
+struct FunctorValueInit
+  < FunctorType
+  , void
+  , T *
+    // First  substitution failure when FunctorType::init does not exist.
+#if defined( KOKKOS_HAVE_CXX11 )
+    // Second substitution failure when FunctorType::init is not compatible
+  , decltype( FunctorValueInitFunction< FunctorType , void >::enable_if( & FunctorType::init ) )
+#else
+  , typename Impl::enable_if< 0 < sizeof( & FunctorType::init ) >::type
+#endif
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * init( const FunctorType & f , void * p )
+    { f.init( (T*)p ); return (T*)p ; }
+};
+
+/* 'init' function provided for single value */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueInit
+  < FunctorType
+  , ArgTag
+  , T &
+    // First  substitution failure when FunctorType::init does not exist.
+#if defined( KOKKOS_HAVE_CXX11 )
+    // Second substitution failure when FunctorType::init is not compatible.
+  , decltype( FunctorValueInitFunction< FunctorType , ArgTag >::enable_if( & FunctorType::init ) )
+#else
+  , typename Impl::enable_if< 0 < sizeof( & FunctorType::init ) >::type
+#endif
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T & init( const FunctorType & f , void * p )
+    { f.init( ArgTag() , *((T*)p) ); return *((T*)p) ; }
+};
+
+/* 'init' function provided for array value */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueInit
+  < FunctorType
+  , ArgTag
+  , T *
+    // First  substitution failure when FunctorType::init does not exist.
+#if defined( KOKKOS_HAVE_CXX11 )
+    // Second substitution failure when FunctorType::init is not compatible
+  , decltype( FunctorValueInitFunction< FunctorType , ArgTag >::enable_if( & FunctorType::init ) )
+#else
+  , typename Impl::enable_if< 0 < sizeof( & FunctorType::init ) >::type
+#endif
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * init( const FunctorType & f , void * p )
+    { f.init( ArgTag() , (T*)p ); return (T*)p ; }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Signatures for compatible FunctorType::join with tag and not an array
+template< class FunctorType , class ArgTag , bool IsArray = 0 == FunctorValueTraits<FunctorType,ArgTag>::StaticValueSize >
+struct FunctorValueJoinFunction {
+
+  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
+
+  typedef       volatile value_type & vref_type ;
+  typedef const volatile value_type & cvref_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , vref_type , cvref_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , vref_type , cvref_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , vref_type , cvref_type ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , vref_type , cvref_type ) );
+};
+
+// Signatures for compatible FunctorType::join with tag and is an array
+template< class FunctorType , class ArgTag >
+struct FunctorValueJoinFunction< FunctorType , ArgTag , true > {
+
+  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
+
+  typedef       volatile value_type * vptr_type ;
+  typedef const volatile value_type * cvptr_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , vptr_type , cvptr_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , vptr_type , cvptr_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , vptr_type , cvptr_type ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , vptr_type , cvptr_type ) );
+};
+
+// Signatures for compatible FunctorType::join without tag and not an array
+template< class FunctorType >
+struct FunctorValueJoinFunction< FunctorType , void , false > {
+
+  typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ;
+
+  typedef       volatile value_type & vref_type ;
+  typedef const volatile value_type & cvref_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( vref_type , cvref_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( vref_type , cvref_type ) );
+};
+
+// Signatures for compatible FunctorType::join without tag and is an array
+template< class FunctorType >
+struct FunctorValueJoinFunction< FunctorType , void , true > {
+
+  typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ;
+
+  typedef       volatile value_type * vptr_type ;
+  typedef const volatile value_type * cvptr_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( vptr_type , cvptr_type ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( vptr_type , cvptr_type ) );
+};
+
+
+template< class FunctorType , class ArgTag
+        , class T = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type
+        , class Enable = void >
+struct FunctorValueJoin ;
+
+/* No 'join' function provided, single value */
+template< class FunctorType , class ArgTag , class T , class Enable >
+struct FunctorValueJoin< FunctorType , ArgTag , T & , Enable >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
+    {
+      *((volatile T*)lhs) += *((const volatile T*)rhs);
+    }
+};
+
+/* No 'join' function provided, array of values */
+template< class FunctorType , class ArgTag , class T , class Enable >
+struct FunctorValueJoin< FunctorType , ArgTag , T * , Enable >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
+    {
+      const int n = FunctorValueTraits<FunctorType,ArgTag>::value_count(f);
+
+      for ( int i = 0 ; i < n ; ++i ) { ((volatile T*)lhs)[i] += ((const volatile T*)rhs)[i]; }
+    }
+};
+
+/* 'join' function provided, single value */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueJoin
+  < FunctorType
+  , ArgTag
+  , T &
+    // First  substitution failure when FunctorType::join does not exist.
+#if defined( KOKKOS_HAVE_CXX11 )
+    // Second substitution failure when enable_if( & Functor::join ) does not exist
+  , decltype( FunctorValueJoinFunction< FunctorType , ArgTag >::enable_if( & FunctorType::join ) )
+#else
+  , typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type
+#endif
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
+    {
+      f.join( ArgTag() , *((volatile T *)lhs) , *((const volatile T *)rhs) );
+    }
+};
+
+/* 'join' function provided, no tag, single value */
+template< class FunctorType , class T >
+struct FunctorValueJoin
+  < FunctorType
+  , void
+  , T &
+    // First  substitution failure when FunctorType::join does not exist.
+#if defined( KOKKOS_HAVE_CXX11 )
+    // Second substitution failure when enable_if( & Functor::join ) does not exist
+  , decltype( FunctorValueJoinFunction< FunctorType , void >::enable_if( & FunctorType::join ) )
+#else
+  , typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type
+#endif
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
+    {
+      f.join( *((volatile T *)lhs) , *((const volatile T *)rhs) );
+    }
+};
+
+/* 'join' function provided for array value */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueJoin
+  < FunctorType
+  , ArgTag
+  , T *
+    // First  substitution failure when FunctorType::join does not exist.
+#if defined( KOKKOS_HAVE_CXX11 )
+    // Second substitution failure when enable_if( & Functor::join ) does not exist
+  , decltype( FunctorValueJoinFunction< FunctorType , ArgTag >::enable_if( & FunctorType::join ) )
+#else
+  , typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type
+#endif
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
+    {
+      f.join( ArgTag() , (volatile T *)lhs , (const volatile T *)rhs );
+    }
+};
+
+/* 'join' function provided, no tag, array value */
+template< class FunctorType , class T >
+struct FunctorValueJoin
+  < FunctorType
+  , void
+  , T *
+    // First  substitution failure when FunctorType::join does not exist.
+#if defined( KOKKOS_HAVE_CXX11 )
+    // Second substitution failure when enable_if( & Functor::join ) does not exist
+  , decltype( FunctorValueJoinFunction< FunctorType , void >::enable_if( & FunctorType::join ) )
+#else
+  , typename Impl::enable_if< 0 < sizeof( & FunctorType::join ) >::type
+#endif
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void join( const FunctorType & f , volatile void * const lhs , const volatile void * const rhs )
+    {
+      f.join( (volatile T *)lhs , (const volatile T *)rhs );
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+namespace Kokkos {
+
+namespace Impl {
+
+#if defined( KOKKOS_HAVE_CXX11 )
+
+  template<typename ValueType, class JoinOp, class Enable = void>
+  struct JoinLambdaAdapter {
+    typedef ValueType value_type;
+    const JoinOp& lambda;
+    KOKKOS_INLINE_FUNCTION
+    JoinLambdaAdapter(const JoinOp& lambda_):lambda(lambda_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void join(volatile value_type& dst, const volatile value_type& src) const {
+      lambda(dst,src);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void join(value_type& dst, const value_type& src) const {
+      lambda(dst,src);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (volatile value_type& dst, const volatile value_type& src) const {
+      lambda(dst,src);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (value_type& dst, const value_type& src) const {
+      lambda(dst,src);
+    }
+  };
+
+  template<typename ValueType, class JoinOp>
+  struct JoinLambdaAdapter<ValueType, JoinOp, decltype( FunctorValueJoinFunction< JoinOp , void >::enable_if( & JoinOp::join ) )> {
+    typedef ValueType value_type;
+    typedef StaticAssertSame<ValueType,typename JoinOp::value_type> assert_value_types_match;
+    const JoinOp& lambda;
+    KOKKOS_INLINE_FUNCTION
+    JoinLambdaAdapter(const JoinOp& lambda_):lambda(lambda_) {}
+
+    KOKKOS_INLINE_FUNCTION
+    void join(volatile value_type& dst, const volatile value_type& src) const {
+      lambda.join(dst,src);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void join(value_type& dst, const value_type& src) const {
+      lambda.join(dst,src);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (volatile value_type& dst, const volatile value_type& src) const {
+      lambda.join(dst,src);
+    }
+
+    KOKKOS_INLINE_FUNCTION
+    void operator() (value_type& dst, const value_type& src) const {
+      lambda.join(dst,src);
+    }
+  };
+
+#endif
+
+  template<typename ValueType>
+  struct JoinAdd {
+    typedef ValueType value_type;
+
+    KOKKOS_INLINE_FUNCTION
+    JoinAdd() {}
+
+    KOKKOS_INLINE_FUNCTION
+    void join(volatile value_type& dst, const volatile value_type& src) const {
+      dst+=src;
+    }
+    KOKKOS_INLINE_FUNCTION
+    void operator() (value_type& dst, const value_type& src) const {
+      dst+=src;
+    }
+    KOKKOS_INLINE_FUNCTION
+    void operator() (volatile value_type& dst, const volatile value_type& src) const {
+      dst+=src;
+    }
+  };
+
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ArgTag
+        , class T = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type >
+struct FunctorValueOps ;
+
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueOps< FunctorType , ArgTag , T & >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * pointer( T & r ) { return & r ; }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T & reference( void * p ) { return *((T*)p); }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void copy( const FunctorType & , void * const lhs , const void * const rhs )
+    { *((T*)lhs) = *((const T*)rhs); }
+};
+
+/* No 'join' function provided, array of values */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorValueOps< FunctorType , ArgTag , T * >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * pointer( T * p ) { return p ; }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  T * reference( void * p ) { return ((T*)p); }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void copy( const FunctorType & f , void * const lhs , const void * const rhs )
+    {
+      const int n = FunctorValueTraits<FunctorType,ArgTag>::value_count(f);
+      for ( int i = 0 ; i < n ; ++i ) { ((T*)lhs)[i] = ((const T*)rhs)[i]; }
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+// Compatible functions for 'final' function and value_type not an array
+template< class FunctorType , class ArgTag , bool IsArray = 0 == FunctorValueTraits<FunctorType,ArgTag>::StaticValueSize >
+struct FunctorFinalFunction {
+
+  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type & ) );
+
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile & ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile & ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type volatile & ) );
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type const & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type const & ) );
+
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const volatile & ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile & ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type const volatile & ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type const volatile & ) );
+};
+
+// Compatible functions for 'final' function and value_type is an array
+template< class FunctorType , class ArgTag >
+struct FunctorFinalFunction< FunctorType , ArgTag , true > {
+
+  typedef typename FunctorValueTraits<FunctorType,ArgTag>::value_type value_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type * ) );
+
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile * ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile * ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type volatile * ) );
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type const * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type const * ) );
+
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const volatile * ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile * ) const );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , value_type const volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , value_type const volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , value_type const volatile * ) );
+  // KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , value_type const volatile * ) );
+};
+
+template< class FunctorType >
+struct FunctorFinalFunction< FunctorType , void , false > {
+
+  typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( value_type & ) );
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type & ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type & ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( const value_type & ) );
+};
+
+template< class FunctorType >
+struct FunctorFinalFunction< FunctorType , void , true > {
+
+  typedef typename FunctorValueTraits<FunctorType,void>::value_type value_type ;
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( value_type * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( value_type * ) );
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type * ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( const value_type * ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( const value_type * ) );
+};
+
+/* No 'final' function provided */
+template< class FunctorType , class ArgTag
+        , class ResultType = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type
+        , class Enable = void >
+struct FunctorFinal
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( const FunctorType & , void * ) {}
+};
+
+/* 'final' function provided */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorFinal
+  < FunctorType
+  , ArgTag
+  , T &
+    // First  substitution failure when FunctorType::final does not exist.
+#if defined( KOKKOS_HAVE_CXX11 )
+    // Second substitution failure when enable_if( & Functor::final ) does not exist
+  , decltype( FunctorFinalFunction< FunctorType , ArgTag >::enable_if( & FunctorType::final ) )
+#else
+  , typename Impl::enable_if< 0 < sizeof( & FunctorType::final ) >::type
+#endif
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( const FunctorType & f , void * p ) { f.final( *((T*)p) ); }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( FunctorType & f , void * p ) { f.final( *((T*)p) ); }
+};
+
+/* 'final' function provided for array value */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorFinal
+  < FunctorType
+  , ArgTag
+  , T *
+    // First  substitution failure when FunctorType::final does not exist.
+#if defined( KOKKOS_HAVE_CXX11 )
+    // Second substitution failure when enable_if( & Functor::final ) does not exist
+  , decltype( FunctorFinalFunction< FunctorType , ArgTag >::enable_if( & FunctorType::final ) )
+#else
+  , typename Impl::enable_if< 0 < sizeof( & FunctorType::final ) >::type
+#endif
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( const FunctorType & f , void * p ) { f.final( (T*)p ); }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void final( FunctorType & f , void * p ) { f.final( (T*)p ); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class FunctorType , class ArgTag
+        , class ReferenceType = typename FunctorValueTraits<FunctorType,ArgTag>::reference_type >
+struct FunctorApplyFunction {
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , ReferenceType ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , ReferenceType ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag         , ReferenceType ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ArgTag const & , ReferenceType ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag         , ReferenceType ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ArgTag const & , ReferenceType ) );
+};
+
+template< class FunctorType , class ReferenceType >
+struct FunctorApplyFunction< FunctorType , void , ReferenceType > {
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ReferenceType ) const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)( ReferenceType ) );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (             *)( ReferenceType ) );
+};
+
+template< class FunctorType >
+struct FunctorApplyFunction< FunctorType , void , void > {
+
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)() const );
+  KOKKOS_INLINE_FUNCTION static void enable_if( void (FunctorType::*)() );
+};
+
+template< class FunctorType , class ArgTag , class ReferenceType
+        , class Enable = void >
+struct FunctorApply
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void apply( const FunctorType & , void * ) {}
+};
+
+/* 'apply' function provided for void value */
+template< class FunctorType , class ArgTag >
+struct FunctorApply
+  < FunctorType
+  , ArgTag
+  , void
+    // First  substitution failure when FunctorType::apply does not exist.
+#if defined( KOKKOS_HAVE_CXX11 )
+    // Second substitution failure when enable_if( & Functor::apply ) does not exist
+  , decltype( FunctorApplyFunction< FunctorType , ArgTag , void >::enable_if( & FunctorType::apply ) )
+#else
+  , typename Impl::enable_if< 0 < sizeof( & FunctorType::apply ) >::type
+#endif
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void apply( FunctorType & f ) { f.apply(); }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void apply( const FunctorType & f ) { f.apply(); }
+};
+
+/* 'apply' function provided for single value */
+template< class FunctorType , class ArgTag , class T >
+struct FunctorApply
+  < FunctorType
+  , ArgTag
+  , T &
+    // First  substitution failure when FunctorType::apply does not exist.
+#if defined( KOKKOS_HAVE_CXX11 )
+    // Second substitution failure when enable_if( & Functor::apply ) does not exist
+  , decltype( FunctorApplyFunction< FunctorType , ArgTag >::enable_if( & FunctorType::apply ) )
+#else
+  , typename Impl::enable_if< 0 < sizeof( & FunctorType::apply ) >::type
+#endif
+  >
+{
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void apply( const FunctorType & f , void * p ) { f.apply( *((T*)p) ); }
+
+  KOKKOS_FORCEINLINE_FUNCTION static
+  void apply( FunctorType & f , void * p ) { f.apply( *((T*)p) ); }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* KOKKOS_FUNCTORADAPTER_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..5c6a5b03b1ca07d6d1b6ba73f07e05a74f71f675
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp
@@ -0,0 +1,455 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+
+#include <Kokkos_Macros.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+#if defined( __INTEL_COMPILER ) && ! defined ( KOKKOS_HAVE_CUDA )
+
+// Intel specialized allocator does not interoperate with CUDA memory allocation
+
+#define KOKKOS_INTEL_MM_ALLOC_AVAILABLE
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+#if ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
+    ( defined( _XOPEN_SOURCE )   && _XOPEN_SOURCE   >= 600 )
+
+#define KOKKOS_POSIX_MEMALIGN_AVAILABLE
+
+#include <unistd.h>
+#include <sys/mman.h>
+
+/* mmap flags for private anonymous memory allocation */
+
+#if defined( MAP_ANONYMOUS ) && defined( MAP_PRIVATE )
+  #define KOKKOS_POSIX_MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS)
+#elif defined( MAP_ANON ) && defined( MAP_PRIVATE )
+  #define KOKKOS_POSIX_MMAP_FLAGS (MAP_PRIVATE | MAP_ANON)
+#endif
+
+// mmap flags for huge page tables
+#if defined( KOKKOS_POSIX_MMAP_FLAGS )
+  #if defined( MAP_HUGETLB )
+    #define KOKKOS_POSIX_MMAP_FLAGS_HUGE (KOKKOS_POSIX_MMAP_FLAGS | MAP_HUGETLB )
+  #else
+    #define KOKKOS_POSIX_MMAP_FLAGS_HUGE KOKKOS_POSIX_MMAP_FLAGS
+  #endif
+#endif
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <memory.h>
+
+#include <iostream>
+#include <sstream>
+#include <cstring>
+
+#include <Kokkos_HostSpace.hpp>
+#include <impl/Kokkos_BasicAllocators.hpp>
+#include <impl/Kokkos_Error.hpp>
+#include <Kokkos_Atomic.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+
+
+DeepCopy<HostSpace,HostSpace>::DeepCopy( void * dst , const void * src , size_t n )
+{
+  memcpy( dst , src , n );
+}
+
+}
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace {
+
+static const int QUERY_SPACE_IN_PARALLEL_MAX = 16 ;
+
+typedef int (* QuerySpaceInParallelPtr )();
+
+QuerySpaceInParallelPtr s_in_parallel_query[ QUERY_SPACE_IN_PARALLEL_MAX ] ;
+int s_in_parallel_query_count = 0 ;
+
+} // namespace <empty>
+
+void HostSpace::register_in_parallel( int (*device_in_parallel)() )
+{
+  if ( 0 == device_in_parallel ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel ERROR : given NULL" ) );
+  }
+
+  int i = -1 ;
+
+  if ( ! (device_in_parallel)() ) {
+    for ( i = 0 ; i < s_in_parallel_query_count && ! (*(s_in_parallel_query[i]))() ; ++i );
+  }
+
+  if ( i < s_in_parallel_query_count ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : called in_parallel" ) );
+
+  }
+
+  if ( QUERY_SPACE_IN_PARALLEL_MAX <= i ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::HostSpace::register_in_parallel_query ERROR : exceeded maximum" ) );
+
+  }
+
+  for ( i = 0 ; i < s_in_parallel_query_count && s_in_parallel_query[i] != device_in_parallel ; ++i );
+
+  if ( i == s_in_parallel_query_count ) {
+    s_in_parallel_query[s_in_parallel_query_count++] = device_in_parallel ;
+  }
+}
+
+int HostSpace::in_parallel()
+{
+  const int n = s_in_parallel_query_count ;
+
+  int i = 0 ;
+
+  while ( i < n && ! (*(s_in_parallel_query[i]))() ) { ++i ; }
+
+  return i < n ;
+}
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+Impl::AllocationTracker HostSpace::allocate_and_track( const std::string & label, const size_t size )
+{
+  return Impl::AllocationTracker( allocator(), size, label );
+}
+
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+/* Default allocation mechanism */
+HostSpace::HostSpace()
+  : m_alloc_mech(
+#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE )
+      HostSpace::INTEL_MM_ALLOC
+#elif defined( KOKKOS_POSIX_MMAP_FLAGS )
+      HostSpace::POSIX_MMAP
+#elif defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE )
+      HostSpace::POSIX_MEMALIGN
+#else
+      HostSpace::STD_MALLOC
+#endif
+    )
+{}
+
+/* Default allocation mechanism */
+HostSpace::HostSpace( const HostSpace::AllocationMechanism & arg_alloc_mech )
+  : m_alloc_mech( HostSpace::STD_MALLOC )
+{
+  if ( arg_alloc_mech == STD_MALLOC ) {
+    m_alloc_mech = HostSpace::STD_MALLOC ;
+  }
+#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE )
+  else if ( arg_alloc_mech == HostSpace::INTEL_MM_ALLOC ) {
+    m_alloc_mech = HostSpace::INTEL_MM_ALLOC ;
+  }
+#elif defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE )
+  else if ( arg_alloc_mech == HostSpace::POSIX_MEMALIGN ) {
+    m_alloc_mech = HostSpace::POSIX_MEMALIGN ;
+  }
+#elif defined( KOKKOS_POSIX_MMAP_FLAGS )
+  else if ( arg_alloc_mech == HostSpace::POSIX_MMAP ) {
+    m_alloc_mech = HostSpace::POSIX_MMAP ;
+  }
+#endif
+  else {
+    const char * const mech =
+      ( arg_alloc_mech == HostSpace::INTEL_MM_ALLOC ) ? "INTEL_MM_ALLOC" : (
+      ( arg_alloc_mech == HostSpace::POSIX_MEMALIGN ) ? "POSIX_MEMALIGN" : (
+      ( arg_alloc_mech == HostSpace::POSIX_MMAP     ) ? "POSIX_MMAP" : "" ));
+
+    std::string msg ;
+    msg.append("Kokkos::HostSpace ");
+    msg.append(mech);
+    msg.append(" is not available" );
+    Kokkos::Impl::throw_runtime_exception( msg );
+  }
+}
+
+void * HostSpace::allocate( const size_t arg_alloc_size ) const
+{
+  static_assert( sizeof(void*) == sizeof(uintptr_t)
+               , "Error sizeof(void*) != sizeof(uintptr_t)" );
+
+  static_assert( Kokkos::Impl::power_of_two< Kokkos::Impl::MEMORY_ALIGNMENT >::value
+               , "Memory alignment must be power of two" );
+
+  constexpr size_t alignment = Kokkos::Impl::MEMORY_ALIGNMENT ;
+  constexpr size_t alignment_mask = alignment - 1 ;
+
+  void * ptr = NULL;
+
+  if ( arg_alloc_size ) {
+
+    if ( m_alloc_mech == STD_MALLOC ) {
+      // Over-allocate to and round up to guarantee proper alignment.
+      size_t size_padded = arg_alloc_size + sizeof(void*) + alignment ;
+
+      void * alloc_ptr = malloc( size_padded );
+
+      if (alloc_ptr) {
+        uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr);
+
+        // offset enough to record the alloc_ptr
+        address += sizeof(void *);
+        uintptr_t rem = address % alignment;
+        uintptr_t offset = rem ? (alignment - rem) : 0u;
+        address += offset;
+        ptr = reinterpret_cast<void *>(address);
+        // record the alloc'd pointer
+        address -= sizeof(void *);
+        *reinterpret_cast<void **>(address) = alloc_ptr;
+      }
+    }
+
+#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE )
+    else if ( m_alloc_mech == INTEL_MM_ALLOC ) {
+      ptr = _mm_malloc( arg_alloc_size , alignment );
+    }
+#endif
+
+#if defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE )
+    else if ( m_alloc_mech == POSIX_MEMALIGN ) {
+      posix_memalign( & ptr, alignment , arg_alloc_size );
+    }
+#endif
+
+#if defined( KOKKOS_POSIX_MMAP_FLAGS )
+    else if ( m_alloc_mech == POSIX_MMAP ) {
+      constexpr size_t use_huge_pages = (1u << 27);
+      constexpr int    prot  = PROT_READ | PROT_WRITE ;
+      const     int    flags = arg_alloc_size < use_huge_pages
+                             ? KOKKOS_POSIX_MMAP_FLAGS
+                             : KOKKOS_POSIX_MMAP_FLAGS_HUGE ;
+
+      // read write access to private memory
+
+      ptr = mmap( NULL /* address hint, if NULL OS kernel chooses address */
+                , arg_alloc_size /* size in bytes */
+                , prot           /* memory protection */
+                , flags          /* visibility of updates */
+                , -1 /* file descriptor */
+                ,  0 /* offset */
+                );
+
+/* Associated reallocation:
+       ptr = mremap( old_ptr , old_size , new_size , MREMAP_MAYMOVE );
+*/
+    }
+#endif
+  }
+
+  if ( reinterpret_cast<uintptr_t>(ptr) & alignment_mask ) {
+    Kokkos::Impl::throw_runtime_exception( "Kokkos::HostSpace aligned allocation failed" );
+  }
+
+  return ptr;
+}
+
+
+void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const
+{
+  if ( arg_alloc_ptr ) {
+
+    if ( m_alloc_mech == STD_MALLOC ) {
+      void * alloc_ptr = *(reinterpret_cast<void **>(arg_alloc_ptr) -1);
+      free( alloc_ptr );
+    }    
+
+#if defined( KOKKOS_INTEL_MM_ALLOC_AVAILABLE )
+    else if ( m_alloc_mech == INTEL_MM_ALLOC ) {
+      _mm_free( arg_alloc_ptr );
+    }
+#endif
+
+#if defined( KOKKOS_POSIX_MEMALIGN_AVAILABLE )
+    else if ( m_alloc_mech == POSIX_MEMALIGN ) {
+      free( arg_alloc_ptr );
+    }
+#endif
+
+#if defined( KOKKOS_POSIX_MMAP_FLAGS )
+    else if ( m_alloc_mech == POSIX_MMAP ) {
+      munmap( arg_alloc_ptr , arg_alloc_size );
+    }
+#endif
+
+  }
+}
+
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+SharedAllocationRecord< void , void >
+SharedAllocationRecord< Kokkos::HostSpace , void >::s_root_record ;
+
+void
+SharedAllocationRecord< Kokkos::HostSpace , void >::
+deallocate( SharedAllocationRecord< void , void > * arg_rec )
+{
+  delete static_cast<SharedAllocationRecord*>(arg_rec);
+}
+
+SharedAllocationRecord< Kokkos::HostSpace , void >::
+~SharedAllocationRecord()
+{
+  m_space.deallocate( SharedAllocationRecord< void , void >::m_alloc_ptr
+                    , SharedAllocationRecord< void , void >::m_alloc_size
+                    );
+}
+
+SharedAllocationRecord< Kokkos::HostSpace , void >::
+SharedAllocationRecord( const Kokkos::HostSpace & arg_space
+                      , const std::string       & arg_label
+                      , const size_t              arg_alloc_size
+                      , const SharedAllocationRecord< void , void >::function_type arg_dealloc
+                      )
+  // Pass through allocated [ SharedAllocationHeader , user_memory ]
+  // Pass through deallocation function
+  : SharedAllocationRecord< void , void >
+      ( & SharedAllocationRecord< Kokkos::HostSpace , void >::s_root_record
+      , reinterpret_cast<SharedAllocationHeader*>( arg_space.allocate( sizeof(SharedAllocationHeader) + arg_alloc_size ) )
+      , sizeof(SharedAllocationHeader) + arg_alloc_size
+      , arg_dealloc
+      )
+  , m_space( arg_space )
+{
+  // Fill in the Header information
+  RecordBase::m_alloc_ptr->m_record = static_cast< SharedAllocationRecord< void , void > * >( this );
+
+  strncpy( RecordBase::m_alloc_ptr->m_label
+          , arg_label.c_str()
+          , SharedAllocationHeader::maximum_label_length
+          );
+}
+
+SharedAllocationRecord< Kokkos::HostSpace , void > *
+SharedAllocationRecord< Kokkos::HostSpace , void >::get_record( void * alloc_ptr )
+{
+  typedef SharedAllocationHeader  Header ;
+  typedef SharedAllocationRecord< Kokkos::HostSpace , void >  RecordHost ;
+
+  SharedAllocationHeader const * const head   = Header::get_header( alloc_ptr );
+  RecordHost                   * const record = static_cast< RecordHost * >( head->m_record );
+
+  if ( record->m_alloc_ptr != head ) {
+    Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void >::get_record ERROR" ) );
+  }
+
+  return record ;
+}
+
+// Iterate records to print orphaned memory ...
+void SharedAllocationRecord< Kokkos::HostSpace , void >::
+print_records( std::ostream & s , const Kokkos::HostSpace & space , bool detail )
+{
+  SharedAllocationRecord< void , void >::print_host_accessible_records( s , "HostSpace" , & s_root_record , detail );
+}
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace {
+  const unsigned HOST_SPACE_ATOMIC_MASK = 0xFFFF;
+  const unsigned HOST_SPACE_ATOMIC_XOR_MASK = 0x5A39;
+  static int HOST_SPACE_ATOMIC_LOCKS[HOST_SPACE_ATOMIC_MASK+1];
+}
+
+namespace Impl {
+void init_lock_array_host_space() {
+  static int is_initialized = 0;
+  if(! is_initialized)
+    for(int i = 0; i < static_cast<int> (HOST_SPACE_ATOMIC_MASK+1); i++)
+      HOST_SPACE_ATOMIC_LOCKS[i] = 0;
+}
+
+bool lock_address_host_space(void* ptr) {
+  return 0 == atomic_compare_exchange( &HOST_SPACE_ATOMIC_LOCKS[
+      (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
+                                  0 , 1);
+}
+
+void unlock_address_host_space(void* ptr) {
+   atomic_exchange( &HOST_SPACE_ATOMIC_LOCKS[
+      (( size_t(ptr) >> 2 ) & HOST_SPACE_ATOMIC_MASK) ^ HOST_SPACE_ATOMIC_XOR_MASK] ,
+                    0);
+}
+
+}
+}
diff --git a/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..17eb0c2f4b4d25f3f738e97465b24c34c39de22d
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp
@@ -0,0 +1,73 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_MEMORY_FENCE )
+#define KOKKOS_MEMORY_FENCE
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+KOKKOS_FORCEINLINE_FUNCTION
+void memory_fence()
+{
+#if defined( KOKKOS_ATOMICS_USE_CUDA )
+  __threadfence();
+#elif defined( KOKKOS_ATOMICS_USE_GCC ) || \
+      ( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ATOMICS_USE_INTEL ) )
+  __sync_synchronize();
+#elif defined( KOKKOS_ATOMICS_USE_INTEL )
+  _mm_mfence();
+#elif defined( KOKKOS_ATOMICS_USE_OMP31 )
+  #pragma omp flush
+#elif defined( KOKKOS_ATOMICS_USE_WINDOWS )
+  MemoryBarrier();
+#else
+ #error "Error: memory_fence() not defined"
+#endif
+}
+
+} // namespace kokkos
+
+#endif
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp b/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..0e87c63e4469e93496074a73f92a98b27b642c61
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_PhysicalLayout.hpp
@@ -0,0 +1,84 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_PHYSICAL_LAYOUT_HPP
+#define KOKKOS_PHYSICAL_LAYOUT_HPP
+
+
+#include <Kokkos_View.hpp>
+namespace Kokkos {
+namespace Impl {
+
+
+
+struct PhysicalLayout {
+  enum LayoutType {Left,Right,Scalar,Error};
+  LayoutType layout_type;
+  int rank;
+  long long int stride[8]; //distance between two neighboring elements in a given dimension
+
+  template< class T , class L , class D , class M >
+  PhysicalLayout( const View<T,L,D,M,ViewDefault> & view )
+    : layout_type( is_same< typename View<T,L,D,M>::array_layout , LayoutLeft  >::value ? Left : (
+                   is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right : Error ))
+    , rank( view.Rank )
+    {
+      for(int i=0;i<8;i++) stride[i] = 0;
+      view.stride( stride );
+    }
+  #ifdef KOKKOS_HAVE_CUDA
+  template< class T , class L , class D , class M >
+  PhysicalLayout( const View<T,L,D,M,ViewCudaTexture> & view )
+    : layout_type( is_same< typename View<T,L,D,M>::array_layout , LayoutLeft  >::value ? Left : (
+                   is_same< typename View<T,L,D,M>::array_layout , LayoutRight >::value ? Right : Error ))
+    , rank( view.Rank )
+    {
+      for(int i=0;i<8;i++) stride[i] = 0;
+      view.stride( stride );
+    }
+  #endif
+};
+
+}
+}
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..5da60841d4376e45baf5c0733cb23c8449278ba3
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_DeviceInfo.hpp
@@ -0,0 +1,57 @@
+/*
+ //@HEADER
+ // ************************************************************************
+ //
+ //                        Kokkos v. 2.0
+ //              Copyright (2014) Sandia Corporation
+ //
+ // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+ // the U.S. Government retains certain rights in this software.
+ //
+ // Redistribution and use in source and binary forms, with or without
+ // modification, are permitted provided that the following conditions are
+ // met:
+ //
+ // 1. Redistributions of source code must retain the above copyright
+ // notice, this list of conditions and the following disclaimer.
+ //
+ // 2. Redistributions in binary form must reproduce the above copyright
+ // notice, this list of conditions and the following disclaimer in the
+ // documentation and/or other materials provided with the distribution.
+ //
+ // 3. Neither the name of the Corporation nor the names of the
+ // contributors may be used to endorse or promote products derived from
+ // this software without specific prior written permission.
+ //
+ // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+ // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+ // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ //
+ // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+ //
+ // ************************************************************************
+ //@HEADER
+*/
+
+#ifndef KOKKOSP_DEVICE_INFO_HPP
+#define KOKKOSP_DEVICE_INFO_HPP
+
+namespace Kokkos {
+namespace Experimental {
+
+    struct KokkosPDeviceInfo {
+        uint32_t deviceID;
+    };
+
+}
+}
+
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..85ec1709c61ca9c7e4020ea01b1da6e06df6a836
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp
@@ -0,0 +1,141 @@
+/*
+ //@HEADER
+ // ************************************************************************
+ //
+ //                        Kokkos v. 2.0
+ //              Copyright (2014) Sandia Corporation
+ //
+ // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+ // the U.S. Government retains certain rights in this software.
+ //
+ // Redistribution and use in source and binary forms, with or without
+ // modification, are permitted provided that the following conditions are
+ // met:
+ //
+ // 1. Redistributions of source code must retain the above copyright
+ // notice, this list of conditions and the following disclaimer.
+ //
+ // 2. Redistributions in binary form must reproduce the above copyright
+ // notice, this list of conditions and the following disclaimer in the
+ // documentation and/or other materials provided with the distribution.
+ //
+ // 3. Neither the name of the Corporation nor the names of the
+ // contributors may be used to endorse or promote products derived from
+ // this software without specific prior written permission.
+ //
+ // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+ // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+ // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ //
+ // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+ //
+ // ************************************************************************
+ //@HEADER
+ */
+
+#include <impl/Kokkos_Profiling_Interface.hpp>
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+#include <string.h>
+
+namespace Kokkos {
+  namespace Experimental {
+    bool profileLibraryLoaded() {
+       	return (NULL != initProfileLibrary);
+    }
+
+    void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
+        if(NULL != beginForCallee) {
+            Kokkos::fence();
+            (*beginForCallee)(kernelPrefix.c_str(), devID, kernelID);
+        }
+    };
+
+    void endParallelFor(const uint64_t kernelID) {
+        if(NULL != endForCallee) {
+            Kokkos::fence();
+            (*endForCallee)(kernelID);
+        }
+    };
+
+    void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
+        if(NULL != beginScanCallee) {
+            Kokkos::fence();
+            (*beginScanCallee)(kernelPrefix.c_str(), devID, kernelID);
+        }
+    };
+
+    void endParallelScan(const uint64_t kernelID) {
+        if(NULL != endScanCallee) {
+            Kokkos::fence();
+            (*endScanCallee)(kernelID);
+        }
+    };
+    
+    void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID) {
+        if(NULL != beginReduceCallee) {
+            Kokkos::fence();
+            (*beginReduceCallee)(kernelPrefix.c_str(), devID, kernelID);
+        }
+    };
+    
+    void endParallelReduce(const uint64_t kernelID) {
+        if(NULL != endReduceCallee) {
+            Kokkos::fence();
+            (*endReduceCallee)(kernelID);
+        }
+    };
+    
+    void initialize() {
+        void* firstProfileLibrary;
+
+        char* envProfileLibrary  = getenv("KOKKOS_PROFILE_LIBRARY");
+	char* profileLibraryName = strtok(envProfileLibrary, ";");
+
+        if( (NULL != profileLibraryName) && (strcmp(profileLibraryName, "") != 0) ) {
+            firstProfileLibrary = dlopen(profileLibraryName, RTLD_NOW | RTLD_GLOBAL);
+
+            if(NULL == firstProfileLibrary) {
+                std::cerr << "Error: Unable to load KokkosP library: " <<
+                profileLibraryName << std::endl;
+            } else {
+                std::cout << "KOKKOSP: Library Loaded: " << profileLibraryName << std::endl;
+
+                beginForCallee = (beginFunction) dlsym(firstProfileLibrary, "kokkosp_begin_parallel_for");
+                beginScanCallee = (beginFunction) dlsym(firstProfileLibrary, "kokkosp_begin_parallel_scan");
+                beginReduceCallee = (beginFunction) dlsym(firstProfileLibrary, "kokkosp_begin_parallel_reduce");
+
+                endScanCallee = (endFunction) dlsym(firstProfileLibrary, "kokkosp_end_parallel_scan");
+                endForCallee = (endFunction) dlsym(firstProfileLibrary, "kokkosp_end_parallel_for");
+                endReduceCallee = (endFunction) dlsym(firstProfileLibrary, "kokkosp_end_parallel_reduce");
+
+                initProfileLibrary = (initFunction) dlsym(firstProfileLibrary, "kokkosp_init_library");
+                finalizeProfileLibrary = (finalizeFunction) dlsym(firstProfileLibrary, "kokkosp_finalize_library");
+            }
+        }
+
+        if(NULL != initProfileLibrary) {
+            (*initProfileLibrary)(0,
+		(uint64_t) KOKKOSP_INTERFACE_VERSION,
+		(uint32_t) 0,
+		NULL);
+        }
+    };
+
+    void finalize() {
+        if(NULL != finalizeProfileLibrary) {
+            (*finalizeProfileLibrary)();
+        }
+    };
+  }
+}
+
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..1e2f715f36d9d6275c42ada1bc11291a1c18b628
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp
@@ -0,0 +1,98 @@
+/*
+ //@HEADER
+ // ************************************************************************
+ //
+ //                        Kokkos v. 2.0
+ //              Copyright (2014) Sandia Corporation
+ //
+ // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+ // the U.S. Government retains certain rights in this software.
+ //
+ // Redistribution and use in source and binary forms, with or without
+ // modification, are permitted provided that the following conditions are
+ // met:
+ //
+ // 1. Redistributions of source code must retain the above copyright
+ // notice, this list of conditions and the following disclaimer.
+ //
+ // 2. Redistributions in binary form must reproduce the above copyright
+ // notice, this list of conditions and the following disclaimer in the
+ // documentation and/or other materials provided with the distribution.
+ //
+ // 3. Neither the name of the Corporation nor the names of the
+ // contributors may be used to endorse or promote products derived from
+ // this software without specific prior written permission.
+ //
+ // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+ // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+ // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ //
+ // Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+ //
+ // ************************************************************************
+ //@HEADER
+ */
+
+#ifndef KOKKOSP_INTERFACE_HPP
+#define KOKKOSP_INTERFACE_HPP
+
+#include <cstddef>
+#include <Kokkos_Core_fwd.hpp>
+#include <Kokkos_Macros.hpp>
+#include <string>
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+#include <impl/Kokkos_Profiling_DeviceInfo.hpp>
+#include <dlfcn.h>
+#include <iostream>
+#include <stdlib.h>
+#endif
+
+#define KOKKOSP_INTERFACE_VERSION 20150628
+
+#ifdef KOKKOSP_ENABLE_PROFILING
+namespace Kokkos {
+  namespace Experimental {
+
+    typedef void (*initFunction)(const int,
+	const uint64_t,
+	const uint32_t,
+	KokkosPDeviceInfo*);
+    typedef void (*finalizeFunction)();
+    typedef void (*beginFunction)(const char*, const uint32_t, uint64_t*);
+    typedef void (*endFunction)(uint64_t);
+
+    static initFunction initProfileLibrary = NULL;
+    static finalizeFunction finalizeProfileLibrary = NULL;
+    static beginFunction beginForCallee = NULL;
+    static beginFunction beginScanCallee = NULL;
+    static beginFunction beginReduceCallee = NULL;
+    static endFunction endForCallee = NULL;
+    static endFunction endScanCallee = NULL;
+    static endFunction endReduceCallee = NULL;
+
+    bool profileLibraryLoaded();
+
+    void beginParallelFor(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
+    void endParallelFor(const uint64_t kernelID);
+    void beginParallelScan(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
+    void endParallelScan(const uint64_t kernelID);
+    void beginParallelReduce(const std::string& kernelPrefix, const uint32_t devID, uint64_t* kernelID);
+    void endParallelReduce(const uint64_t kernelID);
+
+    void initialize();
+    void finalize();
+
+  }
+}
+
+#endif
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..562c7afc6de5e3b6913671e52abc5157dc61c6d5
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp
@@ -0,0 +1,119 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdlib.h>
+#include <sstream>
+#include <Kokkos_Serial.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+#if defined( KOKKOS_HAVE_SERIAL )
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+namespace SerialImpl {
+
+Sentinel::Sentinel() : m_scratch(0), m_reduce_end(0), m_shared_end(0) {}
+
+Sentinel::~Sentinel()
+{
+  if ( m_scratch ) { free( m_scratch ); }
+  m_scratch = 0 ;
+  m_reduce_end = 0 ;
+  m_shared_end = 0 ;
+}
+
+Sentinel & Sentinel::singleton()
+{
+  static Sentinel s ; return s ;
+}
+
+inline
+unsigned align( unsigned n )
+{
+  enum { ALIGN = 0x0100 /* 256 */ , MASK = ALIGN - 1 };
+  return ( n + MASK ) & ~MASK ;
+}
+
+} // namespace
+
+SerialTeamMember::SerialTeamMember( int arg_league_rank
+                                  , int arg_league_size
+                                  , int arg_shared_size
+                                  )
+  : m_space( ((char *) SerialImpl::Sentinel::singleton().m_scratch) + SerialImpl::Sentinel::singleton().m_reduce_end
+           , arg_shared_size )
+  , m_league_rank( arg_league_rank )
+  , m_league_size( arg_league_size )
+{}
+
+} // namespace Impl
+
+void * Serial::scratch_memory_resize( unsigned reduce_size , unsigned shared_size )
+{
+  static Impl::SerialImpl::Sentinel & s = Impl::SerialImpl::Sentinel::singleton();
+
+  reduce_size = Impl::SerialImpl::align( reduce_size );
+  shared_size = Impl::SerialImpl::align( shared_size );
+
+  if ( ( s.m_reduce_end < reduce_size ) ||
+       ( s.m_shared_end < s.m_reduce_end + shared_size ) ) {
+
+    if ( s.m_scratch ) { free( s.m_scratch ); }
+
+    if ( s.m_reduce_end < reduce_size ) s.m_reduce_end = reduce_size ;
+    if ( s.m_shared_end < s.m_reduce_end + shared_size ) s.m_shared_end = s.m_reduce_end + shared_size ;
+
+    s.m_scratch = malloc( s.m_shared_end );
+  }
+
+  return s.m_scratch ;
+}
+
+} // namespace Kokkos
+
+#endif // defined( KOKKOS_HAVE_SERIAL )
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..688f97f42e2f9cc41e4a1353a58a277edb49c905
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
@@ -0,0 +1,336 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#include <impl/Kokkos_Serial_TaskPolicy.hpp>
+
+#if defined( KOKKOS_HAVE_SERIAL )
+#include <stdlib.h>
+#include <stdexcept>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+TaskPolicy< Kokkos::Serial >::member_type &
+TaskPolicy< Kokkos::Serial >::member_single()
+{
+  static member_type s(0,1,0); 
+  return s ;
+}
+
+} // namespace Experimental
+} // namespace Kokkos
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+typedef TaskMember<  Kokkos::Serial , void , void > Task ;
+
+//----------------------------------------------------------------------------
+
+namespace {
+
+inline
+unsigned padded_sizeof_derived( unsigned sizeof_derived )
+{
+  return sizeof_derived +
+    ( sizeof_derived % sizeof(Task*) ? sizeof(Task*) - sizeof_derived % sizeof(Task*) : 0 );
+}
+
+} // namespace
+
+void Task::deallocate( void * ptr )
+{
+  free( ptr );
+}
+
+void * Task::allocate( const unsigned arg_sizeof_derived
+                     , const unsigned arg_dependence_capacity )
+{
+  return malloc( padded_sizeof_derived( arg_sizeof_derived ) + arg_dependence_capacity * sizeof(Task*) );
+}
+
+Task::~TaskMember()
+{
+
+}
+
+Task::TaskMember( const Task::function_verify_type   arg_verify
+                , const Task::function_dealloc_type  arg_dealloc
+                , const Task::function_apply_type    arg_apply
+                , const unsigned                     arg_sizeof_derived
+                , const unsigned                     arg_dependence_capacity
+                )
+  : m_dealloc( arg_dealloc )
+  , m_verify(  arg_verify )
+  , m_apply(   arg_apply )
+  , m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) )
+  , m_wait( 0 )
+  , m_next( 0 )
+  , m_dep_capacity( arg_dependence_capacity )
+  , m_dep_size( 0 )
+  , m_ref_count( 0 )
+  , m_state( TASK_STATE_CONSTRUCTING )
+{
+  for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
+}
+
+Task::TaskMember( const Task::function_dealloc_type  arg_dealloc
+                , const Task::function_apply_type    arg_apply
+                , const unsigned                     arg_sizeof_derived
+                , const unsigned                     arg_dependence_capacity
+                )
+  : m_dealloc( arg_dealloc )
+  , m_verify(  & Task::verify_type<void> )
+  , m_apply(   arg_apply )
+  , m_dep( (Task **)( ((unsigned char *) this) + padded_sizeof_derived( arg_sizeof_derived ) ) )
+  , m_wait( 0 )
+  , m_next( 0 )
+  , m_dep_capacity( arg_dependence_capacity )
+  , m_dep_size( 0 )
+  , m_ref_count( 0 )
+  , m_state( TASK_STATE_CONSTRUCTING )
+{
+  for ( unsigned i = 0 ; i < arg_dependence_capacity ; ++i ) m_dep[i] = 0 ;
+}
+
+//----------------------------------------------------------------------------
+
+void Task::throw_error_add_dependence() const
+{
+  std::cerr << "TaskMember< Serial >::add_dependence ERROR"
+            << " state(" << m_state << ")"
+            << " dep_size(" << m_dep_size << ")"
+            << std::endl ;
+  throw std::runtime_error("TaskMember< Serial >::add_dependence ERROR");
+}
+
+void Task::throw_error_verify_type()
+{
+  throw std::runtime_error("TaskMember< Serial >::verify_type ERROR");
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw )
+{
+  static const char msg_error_header[]      = "Kokkos::Experimental::Impl::TaskManager<Kokkos::Serial>::assign ERROR" ;
+  static const char msg_error_count[]       = ": negative reference count" ;
+  static const char msg_error_complete[]    = ": destroy task that is not complete" ;
+  static const char msg_error_dependences[] = ": destroy task that has dependences" ;
+  static const char msg_error_exception[]   = ": caught internal exception" ;
+
+  const char * msg_error = 0 ;
+
+  try {
+
+    if ( *lhs ) {
+
+      const int count = --((**lhs).m_ref_count);
+
+      if ( 0 == count ) {
+
+        // Reference count at zero, delete it
+
+        // Should only be deallocating a completed task
+        if ( (**lhs).m_state == Kokkos::Experimental::TASK_STATE_COMPLETE ) {
+
+          // A completed task should not have dependences...
+          for ( int i = 0 ; i < (**lhs).m_dep_size && 0 == msg_error ; ++i ) {
+            if ( (**lhs).m_dep[i] ) msg_error = msg_error_dependences ;
+          }
+        }
+        else {
+          msg_error = msg_error_complete ;
+        }
+
+        if ( 0 == msg_error ) {
+          // Get deletion function and apply it
+          const Task::function_dealloc_type d = (**lhs).m_dealloc ;
+
+          (*d)( *lhs );
+        }
+      }
+      else if ( count <= 0 ) {
+        msg_error = msg_error_count ;
+      }
+    }
+
+    if ( 0 == msg_error && rhs ) { ++( rhs->m_ref_count ); }
+
+    *lhs = rhs ;
+  }
+  catch( ... ) {
+    if ( 0 == msg_error ) msg_error = msg_error_exception ;
+  }
+
+  if ( 0 != msg_error ) {
+    if ( no_throw ) {
+      std::cerr << msg_error_header << msg_error << std::endl ;
+      std::cerr.flush();
+    }
+    else {
+      std::string msg(msg_error_header);
+      msg.append(msg_error);
+      throw std::runtime_error( msg );
+    }
+  }
+}
+#endif
+
+namespace {
+
+Task * s_ready = 0 ;
+Task * s_denied = reinterpret_cast<Task*>( ~((unsigned long)0) );
+
+}
+
+void Task::schedule()
+{
+  // Execute ready tasks in case the task being scheduled
+  // is dependent upon a waiting and ready task.
+
+  Task::execute_ready_tasks();
+
+  // spawning   : Constructing -> Waiting
+  // respawning : Executing    -> Waiting
+  // updating   : Waiting      -> Waiting
+
+  // Must not be in a dependence linked list:  0 == t->m_next
+
+  const bool ok_state = TASK_STATE_COMPLETE != m_state ;
+  const bool ok_list  = 0 == m_next ;
+
+  if ( ok_state && ok_list ) {
+
+    // Will be waiting for execution upon return from this function
+
+    m_state = Kokkos::Experimental::TASK_STATE_WAITING ;
+
+    // Insert this task into another dependence that is not complete
+
+    int i = 0 ;
+    for ( ; i < m_dep_size ; ++i ) {
+      Task * const y = m_dep[i] ;
+      if ( y && s_denied != ( m_next = y->m_wait ) ) {
+        y->m_wait = this ; // CAS( & y->m_wait , m_next , this );
+        break ;
+      }
+    }
+    if ( i == m_dep_size ) {
+      // All dependences are complete, insert into the ready list
+      m_next  = s_ready ;
+      s_ready = this ; // CAS( & s_ready , m_next = s_ready , this );
+    }
+  }
+  else {
+    throw std::runtime_error(std::string("Kokkos::Experimental::Impl::Task spawn or respawn state error"));
+  }
+}
+
+void Task::execute_ready_tasks()
+{
+  while ( s_ready ) {
+
+    // Remove this task from the ready list
+
+    // Task * task ;
+    // while ( ! CAS( & s_ready , task = s_ready , s_ready->m_next ) );
+
+    Task * const task = s_ready ;
+    s_ready = task->m_next ;
+
+    task->m_next = 0 ;
+
+    // precondition: task->m_state = TASK_STATE_WAITING
+    // precondition: task->m_dep[i]->m_state == TASK_STATE_COMPLETE  for all i
+    // precondition: does not exist T such that T->m_wait = task
+    // precondition: does not exist T such that T->m_next = task
+
+    task->m_state = Kokkos::Experimental::TASK_STATE_EXECUTING ;
+
+    (*task->m_apply)( task );
+
+    if ( task->m_state == Kokkos::Experimental::TASK_STATE_EXECUTING ) {
+      // task did not respawn itself
+      task->m_state = Kokkos::Experimental::TASK_STATE_COMPLETE ;
+
+      // release dependences:
+      for ( int i = 0 ; i < task->m_dep_size ; ++i ) {
+        assign( task->m_dep + i , 0 );
+      }
+
+      // Stop other tasks from adding themselves to 'task->m_wait' ;
+
+      Task * x ;
+      // CAS( & task->m_wait , x = task->m_wait , s_denied );
+      x = task->m_wait ; task->m_wait = s_denied ;
+
+      // update tasks waiting on this task
+      while ( x ) {
+        Task * const next = x->m_next ;
+
+        x->m_next = 0 ;
+
+        x->schedule(); // could happen concurrently
+
+        x = next ;
+      }
+    }
+  }
+}
+
+} // namespace Impl
+} // namespace Experimental
+} // namespace Kokkos
+
+#endif // defined( KOKKOS_HAVE_SERIAL )
diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..4eec2f66bed30d1286bd97298625e51772781195
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Serial_TaskPolicy.hpp
@@ -0,0 +1,845 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+// Experimental unified task-data parallel manycore LDRD
+
+#ifndef KOKKOS_SERIAL_TASKPOLICY_HPP
+#define KOKKOS_SERIAL_TASKPOLICY_HPP
+
+#include <Kokkos_Macros.hpp>
+#if defined( KOKKOS_HAVE_SERIAL )
+
+#include <string>
+#include <typeinfo>
+#include <stdexcept>
+
+#include <Kokkos_Serial.hpp>
+#include <Kokkos_TaskPolicy.hpp>
+#include <Kokkos_View.hpp>
+
+#include <impl/Kokkos_FunctorAdapter.hpp>
+
+//----------------------------------------------------------------------------
+/*  Inheritance structure to allow static_cast from the task root type
+ *  and a task's FunctorType.
+ *
+ *    task_root_type == TaskMember< Space , void , void >
+ *
+ *    TaskMember< PolicyType , ResultType , FunctorType >
+ *      : TaskMember< PolicyType::Space , ResultType , FunctorType >
+ *      { ... };
+ *
+ *    TaskMember< Space , ResultType , FunctorType >
+ *      : TaskMember< Space , ResultType , void >
+ *      , FunctorType
+ *      { ... };
+ *
+ *  when ResultType != void
+ *
+ *    TaskMember< Space , ResultType , void >
+ *      : TaskMember< Space , void , void >
+ *      { ... };
+ *
+ */
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+namespace Impl {
+
+/** \brief  Base class for all tasks in the Serial execution space */
+template<>
+class TaskMember< Kokkos::Serial , void , void >
+{
+public:
+
+  typedef void         (* function_apply_type)  ( TaskMember * );
+  typedef void         (* function_dealloc_type)( TaskMember * );
+  typedef TaskMember * (* function_verify_type) ( TaskMember * );
+
+private:
+
+  const function_dealloc_type  m_dealloc ; ///< Deallocation
+  const function_verify_type   m_verify ;  ///< Result type verification
+  const function_apply_type    m_apply ;   ///< Apply function
+  TaskMember ** const          m_dep ;     ///< Dependences
+  TaskMember *                 m_wait ;    ///< Linked list of tasks waiting on this task
+  TaskMember *                 m_next ;    ///< Linked list of tasks waiting on a different task
+  const int                    m_dep_capacity ; ///< Capacity of dependences
+  int                          m_dep_size ;     ///< Actual count of dependences
+  int                          m_ref_count ;    ///< Reference count
+  int                          m_state ;        ///< State of the task
+
+  // size = 6 Pointers + 4 ints
+
+  TaskMember() /* = delete */ ;
+  TaskMember( const TaskMember & ) /* = delete */ ;
+  TaskMember & operator = ( const TaskMember & ) /* = delete */ ;
+
+  static void * allocate( const unsigned arg_sizeof_derived , const unsigned arg_dependence_capacity );
+  static void   deallocate( void * );
+
+  void throw_error_add_dependence() const ;
+  static void throw_error_verify_type();
+
+  template < class DerivedTaskType >
+  static
+  void deallocate( TaskMember * t )
+    {
+      DerivedTaskType * ptr = static_cast< DerivedTaskType * >(t);
+      ptr->~DerivedTaskType();
+      deallocate( (void *) ptr );
+    }
+
+protected :
+
+  ~TaskMember();
+
+  // Used by TaskMember< Serial , ResultType , void >
+  TaskMember( const function_verify_type   arg_verify
+            , const function_dealloc_type  arg_dealloc
+            , const function_apply_type    arg_apply
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
+            );
+
+  // Used for TaskMember< Serial , void , void >
+  TaskMember( const function_dealloc_type  arg_dealloc
+            , const function_apply_type    arg_apply
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
+            );
+
+public:
+
+  template< typename ResultType >
+  KOKKOS_FUNCTION static
+  TaskMember * verify_type( TaskMember * t )
+    {
+      enum { check_type = ! Kokkos::Impl::is_same< ResultType , void >::value };
+
+      if ( check_type && t != 0 ) {
+
+        // Verify that t->m_verify is this function
+        const function_verify_type self = & TaskMember::template verify_type< ResultType > ;
+
+        if ( t->m_verify != self ) {
+          t = 0 ;
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+          throw_error_verify_type();
+#endif
+        }
+      }
+      return t ;
+    }
+
+  //----------------------------------------
+  /*  Inheritence Requirements on task types:
+   *    typedef  FunctorType::value_type  value_type ;
+   *    class DerivedTaskType
+   *      : public TaskMember< Serial , value_type , FunctorType >
+   *      { ... };
+   *    class TaskMember< Serial , value_type , FunctorType >
+   *      : public TaskMember< Serial , value_type , void >
+   *      , public Functor
+   *      { ... };
+   *  If value_type != void
+   *    class TaskMember< Serial , value_type , void >
+   *      : public TaskMember< Serial , void , void >
+   *
+   *  Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
+   *
+   */
+
+  /** \brief  Allocate and construct a single-thread task */
+  template< class DerivedTaskType >
+  static
+  TaskMember * create( const typename DerivedTaskType::functor_type &  arg_functor
+                     , const unsigned                                  arg_dependence_capacity
+                     )
+    {
+      typedef typename DerivedTaskType::functor_type  functor_type ;
+      typedef typename functor_type::value_type       value_type ;
+
+      DerivedTaskType * const task =
+        new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
+          DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
+                         , & TaskMember::template apply_single< functor_type , value_type >
+                         , sizeof(DerivedTaskType)
+                         , arg_dependence_capacity
+                         , arg_functor );
+
+      return static_cast< TaskMember * >( task );
+    }
+
+  /** \brief  Allocate and construct a data parallel task */
+  template< class DerivedTaskType >
+  static
+  TaskMember * create( const typename DerivedTaskType::policy_type &   arg_policy
+                     , const typename DerivedTaskType::functor_type &  arg_functor
+                     , const unsigned                                  arg_dependence_capacity
+                     )
+    {
+      DerivedTaskType * const task =
+        new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
+          DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
+                         , sizeof(DerivedTaskType)
+                         , arg_dependence_capacity
+                         , arg_policy
+                         , arg_functor
+                         );
+
+      return static_cast< TaskMember * >( task );
+    }
+
+  /** \brief  Allocate and construct a thread-team task */
+  template< class DerivedTaskType >
+  static
+  TaskMember * create_team( const typename DerivedTaskType::functor_type &  arg_functor
+                          , const unsigned                                  arg_dependence_capacity
+                          )
+    {
+      typedef typename DerivedTaskType::functor_type  functor_type ;
+      typedef typename functor_type::value_type       value_type ;
+
+      DerivedTaskType * const task =
+        new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
+          DerivedTaskType( & TaskMember::template deallocate< DerivedTaskType >
+                         , & TaskMember::template apply_team< functor_type , value_type >
+                         , sizeof(DerivedTaskType)
+                         , arg_dependence_capacity
+                         , arg_functor );
+
+      return static_cast< TaskMember * >( task );
+    }
+
+  void schedule();
+  static void execute_ready_tasks();
+
+  //----------------------------------------
+
+  typedef FutureValueTypeIsVoidError get_result_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const { return get_result_type() ; }
+
+  KOKKOS_INLINE_FUNCTION
+  Kokkos::Experimental::TaskState get_state() const { return Kokkos::Experimental::TaskState( m_state ); }
+
+  //----------------------------------------
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+  static
+  void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false );
+#else
+  KOKKOS_INLINE_FUNCTION static
+  void assign( TaskMember ** const lhs , TaskMember * const rhs , const bool no_throw = false ) {}
+#endif
+
+  KOKKOS_INLINE_FUNCTION
+  TaskMember * get_dependence( int i ) const
+    { return ( Kokkos::Experimental::TASK_STATE_EXECUTING == m_state && 0 <= i && i < m_dep_size ) ? m_dep[i] : (TaskMember*) 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  int get_dependence() const
+    { return m_dep_size ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void clear_dependence()
+    {
+      for ( int i = 0 ; i < m_dep_size ; ++i ) assign( m_dep + i , 0 );
+      m_dep_size = 0 ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( TaskMember * before )
+    {
+      if ( ( Kokkos::Experimental::TASK_STATE_CONSTRUCTING == m_state ||
+             Kokkos::Experimental::TASK_STATE_EXECUTING    == m_state ) &&
+           m_dep_size < m_dep_capacity ) {
+        assign( m_dep + m_dep_size , before );
+        ++m_dep_size ;
+      }
+      else {
+        throw_error_add_dependence();
+      }
+    }
+
+  //----------------------------------------
+
+  template< class FunctorType , class ResultType >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_single( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
+    {
+      typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ;
+
+      // TaskMember< Kokkos::Serial , ResultType , FunctorType >
+      //   : public TaskMember< Kokkos::Serial , ResultType , void >
+      //   , public FunctorType
+      //   { ... };
+
+      derived_type & m = * static_cast< derived_type * >( t );
+
+      Kokkos::Impl::FunctorApply< FunctorType , void , ResultType & >::apply( (FunctorType &) m , & m.m_result );
+    }
+
+  template< class FunctorType , class ResultType >
+  KOKKOS_INLINE_FUNCTION static
+  void apply_single( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
+    {
+      typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ;
+
+      // TaskMember< Kokkos::Serial , ResultType , FunctorType >
+      //   : public TaskMember< Kokkos::Serial , ResultType , void >
+      //   , public FunctorType
+      //   { ... };
+
+      derived_type & m = * static_cast< derived_type * >( t );
+
+      Kokkos::Impl::FunctorApply< FunctorType , void , void >::apply( (FunctorType &) m );
+    }
+
+  //----------------------------------------
+
+  template< class FunctorType , class ResultType >
+  static
+  void apply_team( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
+    {
+      typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ;
+      typedef Kokkos::Impl::SerialTeamMember                          member_type ;
+
+      // TaskMember< Kokkos::Serial , ResultType , FunctorType >
+      //   : public TaskMember< Kokkos::Serial , ResultType , void >
+      //   , public FunctorType
+      //   { ... };
+
+      derived_type & m = * static_cast< derived_type * >( t );
+
+      m.FunctorType::apply( member_type(0,1,0) , m.m_result );
+    }
+
+  template< class FunctorType , class ResultType >
+  static
+  void apply_team( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same< ResultType , void >::value , TaskMember * >::type t )
+    {
+      typedef TaskMember< Kokkos::Serial , ResultType , FunctorType > derived_type ;
+      typedef Kokkos::Impl::SerialTeamMember                          member_type ;
+
+      // TaskMember< Kokkos::Serial , ResultType , FunctorType >
+      //   : public TaskMember< Kokkos::Serial , ResultType , void >
+      //   , public FunctorType
+      //   { ... };
+
+      derived_type & m = * static_cast< derived_type * >( t );
+
+      m.FunctorType::apply( member_type(0,1,0) );
+    }
+};
+
+//----------------------------------------------------------------------------
+/** \brief  Base class for tasks with a result value in the Serial execution space.
+ *
+ *  The FunctorType must be void because this class is accessed by the
+ *  Future class for the task and result value.
+ *
+ *  Must be derived from TaskMember<S,void,void> 'root class' so the Future class
+ *  can correctly static_cast from the 'root class' to this class.
+ */
+template < class ResultType >
+class TaskMember< Kokkos::Serial , ResultType , void >
+  : public TaskMember< Kokkos::Serial , void , void >
+{
+public:
+
+  ResultType  m_result ;
+
+  typedef const ResultType & get_result_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  get_result_type get() const { return m_result ; }
+
+protected:
+
+  typedef TaskMember< Kokkos::Serial , void , void >  task_root_type ;
+  typedef task_root_type::function_dealloc_type       function_dealloc_type ;
+  typedef task_root_type::function_apply_type         function_apply_type ;
+
+  inline
+  TaskMember( const function_dealloc_type  arg_dealloc
+            , const function_apply_type    arg_apply
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
+            )
+    : task_root_type( & task_root_type::template verify_type< ResultType >
+                    , arg_dealloc
+                    , arg_apply
+                    , arg_sizeof_derived
+                    , arg_dependence_capacity )
+    , m_result()
+    {}
+};
+
+template< class ResultType , class FunctorType >
+class TaskMember< Kokkos::Serial , ResultType , FunctorType >
+  : public TaskMember< Kokkos::Serial , ResultType , void >
+  , public FunctorType
+{
+public:
+
+  typedef FunctorType  functor_type ;
+
+  typedef TaskMember< Kokkos::Serial , void , void >        task_root_type ;
+  typedef TaskMember< Kokkos::Serial , ResultType , void >  task_base_type ;
+  typedef task_root_type::function_dealloc_type             function_dealloc_type ;
+  typedef task_root_type::function_apply_type               function_apply_type ;
+
+  inline
+  TaskMember( const function_dealloc_type  arg_dealloc
+            , const function_apply_type    arg_apply
+            , const unsigned               arg_sizeof_derived
+            , const unsigned               arg_dependence_capacity
+            , const functor_type &         arg_functor
+            )
+    : task_base_type( arg_dealloc , arg_apply , arg_sizeof_derived , arg_dependence_capacity )
+    , functor_type( arg_functor )
+    {}
+};
+
+//----------------------------------------------------------------------------
+/** \brief  ForEach task in the Serial execution space
+ *
+ *  Derived from TaskMember< Kokkos::Serial , ResultType , FunctorType >
+ *  so that Functor can be cast to task root type without knowing policy.
+ */
+template< class Arg0 , class Arg1 , class Arg2 , class ResultType , class FunctorType >
+class TaskForEach< Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >
+                 , ResultType
+                 , FunctorType >
+  : TaskMember< Kokkos::Serial , ResultType , FunctorType >
+{
+public:
+
+  typedef FunctorType                                              functor_type ;
+  typedef RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >       policy_type ;
+
+private:
+
+  friend class Kokkos::Experimental::TaskPolicy< Kokkos::Serial > ;
+  friend class Kokkos::Experimental::Impl::TaskMember< Kokkos::Serial , void , void > ;
+
+  typedef TaskMember< Kokkos::Serial , void , void >               task_root_type ;
+  typedef TaskMember< Kokkos::Serial , ResultType , FunctorType >  task_base_type ;
+  typedef task_root_type::function_dealloc_type                    function_dealloc_type ;
+
+  policy_type  m_policy ;
+
+  template< class Tag >
+  inline
+  typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same<Tag,void>::value >::type
+    apply_policy() const
+    {
+      const typename policy_type::member_type e = m_policy.end();
+      for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
+        functor_type::operator()(i);
+      }
+    }
+
+  template< class Tag >
+  inline
+  typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same<Tag,void>::value >::type
+    apply_policy() const
+    {
+      const Tag tag ;
+      const typename policy_type::member_type e = m_policy.end();
+      for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
+        functor_type::operator()(tag,i);
+      }
+    }
+
+  static
+  void apply_parallel( task_root_type * t )
+    {
+      static_cast<TaskForEach*>(t)->template apply_policy< typename policy_type::work_tag >();
+
+      task_root_type::template apply_single< functor_type , ResultType >( t );
+    }
+
+  TaskForEach( const function_dealloc_type  arg_dealloc
+             , const int                    arg_sizeof_derived
+             , const int                    arg_dependence_capacity
+             , const policy_type &          arg_policy
+             , const functor_type &         arg_functor
+             )
+    : task_base_type( arg_dealloc
+                    , & apply_parallel
+                    , arg_sizeof_derived
+                    , arg_dependence_capacity
+                    , arg_functor )
+    , m_policy( arg_policy )
+    {}
+
+  TaskForEach() /* = delete */ ;
+  TaskForEach( const TaskForEach & ) /* = delete */ ;
+  TaskForEach & operator = ( const TaskForEach & ) /* = delete */ ;
+};
+
+//----------------------------------------------------------------------------
+/** \brief  Reduce task in the Serial execution space
+ *
+ *  Derived from TaskMember< Kokkos::Serial , ResultType , FunctorType >
+ *  so that Functor can be cast to task root type without knowing policy.
+ */
+template< class Arg0 , class Arg1 , class Arg2 , class ResultType , class FunctorType >
+class TaskReduce< Kokkos::RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >
+                , ResultType
+                , FunctorType >
+  : TaskMember< Kokkos::Serial , ResultType , FunctorType >
+{
+public:
+
+  typedef FunctorType                                              functor_type ;
+  typedef RangePolicy< Arg0 , Arg1 , Arg2 , Kokkos::Serial >       policy_type ;
+
+private:
+
+  friend class Kokkos::Experimental::TaskPolicy< Kokkos::Serial > ;
+  friend class Kokkos::Experimental::Impl::TaskMember< Kokkos::Serial , void , void > ;
+
+  typedef TaskMember< Kokkos::Serial , void , void >               task_root_type ;
+  typedef TaskMember< Kokkos::Serial , ResultType , FunctorType >  task_base_type ;
+  typedef task_root_type::function_dealloc_type                    function_dealloc_type ;
+
+  policy_type  m_policy ;
+
+  template< class Tag >
+  inline
+  void apply_policy( typename Kokkos::Impl::enable_if< Kokkos::Impl::is_same<Tag,void>::value , ResultType & >::type result ) const
+    {
+      Kokkos::Impl::FunctorValueInit< functor_type , Tag >::init( *this , & result );
+      const typename policy_type::member_type e = m_policy.end();
+      for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
+        functor_type::operator()( i, result );
+      }
+    }
+
+  template< class Tag >
+  inline
+  void apply_policy( typename Kokkos::Impl::enable_if< ! Kokkos::Impl::is_same<Tag,void>::value , ResultType & >::type result ) const
+    {
+      Kokkos::Impl::FunctorValueInit< functor_type , Tag >::init( *this , & result );
+      const Tag tag ;
+      const typename policy_type::member_type e = m_policy.end();
+      for ( typename policy_type::member_type i = m_policy.begin() ; i < e ; ++i ) {
+        functor_type::operator()( tag, i, result );
+      }
+    }
+
+  static
+  void apply_parallel( task_root_type * t )
+    {
+      TaskReduce * const task = static_cast<TaskReduce*>(t);
+
+      task->template apply_policy< typename policy_type::work_tag >( task->task_base_type::m_result );
+
+      task_root_type::template apply_single< functor_type , ResultType >( t );
+    }
+
+  TaskReduce( const function_dealloc_type  arg_dealloc
+            , const int                    arg_sizeof_derived
+            , const int                    arg_dependence_capacity
+            , const policy_type &          arg_policy
+            , const functor_type &         arg_functor
+            )
+    : task_base_type( arg_dealloc
+                    , & apply_parallel
+                    , arg_sizeof_derived
+                    , arg_dependence_capacity
+                    , arg_functor )
+    , m_policy( arg_policy )
+    {}
+
+  TaskReduce() /* = delete */ ;
+  TaskReduce( const TaskReduce & ) /* = delete */ ;
+  TaskReduce & operator = ( const TaskReduce & ) /* = delete */ ;
+};
+
+} /* namespace Impl */
+} /* namespace Experimental */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Experimental {
+
+template<>
+class TaskPolicy< Kokkos::Serial >
+{
+public:
+
+  typedef Kokkos::Serial                  execution_space ;
+  typedef Kokkos::Impl::SerialTeamMember  member_type ;
+
+private:
+
+  typedef Impl::TaskMember< execution_space , void , void > task_root_type ;
+
+  template< class FunctorType >
+  static inline
+  const task_root_type * get_task_root( const FunctorType * f )
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
+      return static_cast< const task_root_type * >( static_cast< const task_type * >(f) );
+    }
+
+  template< class FunctorType >
+  static inline
+  task_root_type * get_task_root( FunctorType * f )
+    {
+      typedef Impl::TaskMember< execution_space , typename FunctorType::value_type , FunctorType > task_type ;
+      return static_cast< task_root_type * >( static_cast< task_type * >(f) );
+    }
+
+  unsigned m_default_dependence_capacity ;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy() : m_default_dependence_capacity(4) {}
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy( const TaskPolicy & rhs ) : m_default_dependence_capacity( rhs.m_default_dependence_capacity ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  explicit
+  TaskPolicy( const unsigned arg_default_dependence_capacity )
+    : m_default_dependence_capacity( arg_default_dependence_capacity ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  TaskPolicy( const TaskPolicy &
+            , const unsigned arg_default_dependence_capacity )
+    : m_default_dependence_capacity( arg_default_dependence_capacity ) {}
+
+  TaskPolicy & operator = ( const TaskPolicy &rhs ) 
+    {
+      m_default_dependence_capacity = rhs.m_default_dependence_capacity;
+      return *this;
+    }
+
+  //----------------------------------------
+
+  template< class ValueType >
+  KOKKOS_INLINE_FUNCTION
+  const Future< ValueType , execution_space > &
+    spawn( const Future< ValueType , execution_space > & f ) const
+      {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        f.m_task->schedule();
+#endif
+        return f ;
+      }
+
+  // Create single-thread task
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  create( const FunctorType & functor
+        , const unsigned dependence_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type value_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
+      return Future< value_type , execution_space >(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        task_root_type::create< task_type >(
+          functor , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) )
+#endif
+        );
+    }
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  create_team( const FunctorType & functor
+             , const unsigned dependence_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type value_type ;
+      typedef Impl::TaskMember< execution_space , value_type , FunctorType >  task_type ;
+      return Future< value_type , execution_space >(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        task_root_type::create_team< task_type >(
+          functor , ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) )
+#endif
+        );
+    }
+
+  // Create parallel foreach task
+
+  template< class PolicyType , class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  create_foreach( const PolicyType  & policy
+                , const FunctorType & functor
+                , const unsigned      dependence_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type value_type ;
+      typedef Impl::TaskForEach< PolicyType , value_type , FunctorType > task_type ;
+      return Future< value_type , execution_space >(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        task_root_type::create< task_type >( policy , functor ,
+          ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) )
+#endif
+       );
+    }
+
+  // Create parallel reduce task
+
+  template< class PolicyType , class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< typename FunctorType::value_type , execution_space >
+  create_reduce( const PolicyType  & policy
+               , const FunctorType & functor
+               , const unsigned      dependence_capacity = ~0u ) const
+    {
+      typedef typename FunctorType::value_type value_type ;
+      typedef Impl::TaskReduce< PolicyType , value_type , FunctorType > task_type ;
+      return Future< value_type , execution_space >(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        task_root_type::create< task_type >( policy , functor ,
+          ( ~0u == dependence_capacity ? m_default_dependence_capacity : dependence_capacity ) )
+#endif
+        );
+    }
+
+  // Add dependence
+  template< class A1 , class A2 , class A3 , class A4 >
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( const Future<A1,A2> & after
+                     , const Future<A3,A4> & before
+                     , typename Kokkos::Impl::enable_if
+                        < Kokkos::Impl::is_same< typename Future<A1,A2>::execution_space , execution_space >::value
+                          &&
+                          Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
+                        >::type * = 0
+                      ) const
+    {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+      after.m_task->add_dependence( before.m_task );
+#endif
+    }
+
+  //----------------------------------------
+  // Functions for an executing task functor to query dependences,
+  // set new dependences, and respawn itself.
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  Future< void , execution_space >
+  get_dependence( const FunctorType * task_functor , int i ) const
+    {
+      return Future<void,execution_space>(
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+        get_task_root(task_functor)->get_dependence(i)
+#endif
+        );
+    }
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  int get_dependence( const FunctorType * task_functor ) const
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { return get_task_root(task_functor)->get_dependence(); }
+#else
+    { return 0 ; }
+#endif
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  void clear_dependence( FunctorType * task_functor ) const
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { get_task_root(task_functor)->clear_dependence(); }
+#else
+    {}
+#endif
+
+  template< class FunctorType , class A3 , class A4 >
+  KOKKOS_INLINE_FUNCTION
+  void add_dependence( FunctorType * task_functor
+                     , const Future<A3,A4> & before
+                     , typename Kokkos::Impl::enable_if
+                        < Kokkos::Impl::is_same< typename Future<A3,A4>::execution_space , execution_space >::value
+                        >::type * = 0
+                      ) const
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { get_task_root(task_functor)->add_dependence( before.m_task ); }
+#else
+    {}
+#endif
+
+  template< class FunctorType >
+  KOKKOS_INLINE_FUNCTION
+  void respawn( FunctorType * task_functor ) const
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { get_task_root(task_functor)->schedule(); }
+#else
+    {}
+#endif
+
+  //----------------------------------------
+
+  static member_type & member_single();
+};
+
+inline
+void wait( TaskPolicy< Kokkos::Serial > & )
+{ Impl::TaskMember< Kokkos::Serial , void , void >::execute_ready_tasks(); }
+
+} /* namespace Experimental */
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+
+#endif /* defined( KOKKOS_HAVE_SERIAL ) */
+#endif /* #define KOKKOS_SERIAL_TASK_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Shape.cpp b/lib/kokkos/core/src/impl/Kokkos_Shape.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..da12db1f381e790e46604f8a15280d2a07f5152a
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Shape.cpp
@@ -0,0 +1,178 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+
+#include <sstream>
+#include <impl/Kokkos_Error.hpp>
+#include <impl/Kokkos_Shape.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+void assert_counts_are_equal_throw(
+  const size_t x_count ,
+  const size_t y_count )
+{
+  std::ostringstream msg ;
+
+  msg << "Kokkos::Impl::assert_counts_are_equal_throw( "
+      << x_count << " != " << y_count << " )" ;
+
+  throw_runtime_exception( msg.str() );
+}
+
+void assert_shapes_are_equal_throw(
+  const unsigned x_scalar_size ,
+  const unsigned x_rank ,
+  const size_t   x_N0 , const unsigned x_N1 ,
+  const unsigned x_N2 , const unsigned x_N3 ,
+  const unsigned x_N4 , const unsigned x_N5 ,
+  const unsigned x_N6 , const unsigned x_N7 ,
+
+  const unsigned y_scalar_size ,
+  const unsigned y_rank ,
+  const size_t   y_N0 , const unsigned y_N1 ,
+  const unsigned y_N2 , const unsigned y_N3 ,
+  const unsigned y_N4 , const unsigned y_N5 ,
+  const unsigned y_N6 , const unsigned y_N7 )
+{
+  std::ostringstream msg ;
+
+  msg << "Kokkos::Impl::assert_shape_are_equal_throw( {"
+      << " scalar_size(" << x_scalar_size
+      << ") rank(" << x_rank
+      << ") dimension(" ;
+  if ( 0 < x_rank ) { msg << " " << x_N0 ; }
+  if ( 1 < x_rank ) { msg << " " << x_N1 ; }
+  if ( 2 < x_rank ) { msg << " " << x_N2 ; }
+  if ( 3 < x_rank ) { msg << " " << x_N3 ; }
+  if ( 4 < x_rank ) { msg << " " << x_N4 ; }
+  if ( 5 < x_rank ) { msg << " " << x_N5 ; }
+  if ( 6 < x_rank ) { msg << " " << x_N6 ; }
+  if ( 7 < x_rank ) { msg << " " << x_N7 ; }
+  msg << " ) } != { "
+      << " scalar_size(" << y_scalar_size
+      << ") rank(" << y_rank
+      << ") dimension(" ;
+  if ( 0 < y_rank ) { msg << " " << y_N0 ; }
+  if ( 1 < y_rank ) { msg << " " << y_N1 ; }
+  if ( 2 < y_rank ) { msg << " " << y_N2 ; }
+  if ( 3 < y_rank ) { msg << " " << y_N3 ; }
+  if ( 4 < y_rank ) { msg << " " << y_N4 ; }
+  if ( 5 < y_rank ) { msg << " " << y_N5 ; }
+  if ( 6 < y_rank ) { msg << " " << y_N6 ; }
+  if ( 7 < y_rank ) { msg << " " << y_N7 ; }
+  msg << " ) } )" ;
+
+  throw_runtime_exception( msg.str() );
+}
+
+void AssertShapeBoundsAbort< Kokkos::HostSpace >::apply(
+  const size_t rank ,
+  const size_t n0 , const size_t n1 , 
+  const size_t n2 , const size_t n3 ,
+  const size_t n4 , const size_t n5 ,
+  const size_t n6 , const size_t n7 ,
+
+  const size_t arg_rank ,
+  const size_t i0 , const size_t i1 ,
+  const size_t i2 , const size_t i3 ,
+  const size_t i4 , const size_t i5 ,
+  const size_t i6 , const size_t i7 )
+{
+  std::ostringstream msg ;
+  msg << "Kokkos::Impl::AssertShapeBoundsAbort( shape = {" ;
+  if ( 0 < rank ) { msg << " " << n0 ; }
+  if ( 1 < rank ) { msg << " " << n1 ; }
+  if ( 2 < rank ) { msg << " " << n2 ; }
+  if ( 3 < rank ) { msg << " " << n3 ; }
+  if ( 4 < rank ) { msg << " " << n4 ; }
+  if ( 5 < rank ) { msg << " " << n5 ; }
+  if ( 6 < rank ) { msg << " " << n6 ; }
+  if ( 7 < rank ) { msg << " " << n7 ; }
+  msg << " } index = {" ;
+  if ( 0 < arg_rank ) { msg << " " << i0 ; }
+  if ( 1 < arg_rank ) { msg << " " << i1 ; }
+  if ( 2 < arg_rank ) { msg << " " << i2 ; }
+  if ( 3 < arg_rank ) { msg << " " << i3 ; }
+  if ( 4 < arg_rank ) { msg << " " << i4 ; }
+  if ( 5 < arg_rank ) { msg << " " << i5 ; }
+  if ( 6 < arg_rank ) { msg << " " << i6 ; }
+  if ( 7 < arg_rank ) { msg << " " << i7 ; }
+  msg << " } )" ;
+
+  throw_runtime_exception( msg.str() );
+}
+
+void assert_shape_effective_rank1_at_leastN_throw(
+  const size_t x_rank , const size_t x_N0 ,
+  const size_t x_N1 ,   const size_t x_N2 ,
+  const size_t x_N3 ,   const size_t x_N4 ,
+  const size_t x_N5 ,   const size_t x_N6 ,
+  const size_t x_N7 ,
+  const size_t N0 )
+{
+  std::ostringstream msg ;
+
+  msg << "Kokkos::Impl::assert_shape_effective_rank1_at_leastN_throw( shape = {" ;
+  if ( 0 < x_rank ) { msg << " " << x_N0 ; }
+  if ( 1 < x_rank ) { msg << " " << x_N1 ; }
+  if ( 2 < x_rank ) { msg << " " << x_N2 ; }
+  if ( 3 < x_rank ) { msg << " " << x_N3 ; }
+  if ( 4 < x_rank ) { msg << " " << x_N4 ; }
+  if ( 5 < x_rank ) { msg << " " << x_N5 ; }
+  if ( 6 < x_rank ) { msg << " " << x_N6 ; }
+  if ( 7 < x_rank ) { msg << " " << x_N7 ; }
+  msg << " } N = " << N0 << " )" ;
+
+  throw_runtime_exception( msg.str() );
+}
+
+
+
+}
+}
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Shape.hpp b/lib/kokkos/core/src/impl/Kokkos_Shape.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..dba73012701776b028b0f3cbc109e2b9c6231644
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Shape.hpp
@@ -0,0 +1,917 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SHAPE_HPP
+#define KOKKOS_SHAPE_HPP
+
+#include <typeinfo>
+#include <utility>
+#include <Kokkos_Core_fwd.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_StaticAssert.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+/** \brief  The shape of a Kokkos with dynamic and static dimensions.
+ *          Dynamic dimensions are member values and static dimensions are
+ *          'static const' values.
+ *
+ *  The upper bound on the array rank is eight.
+ */
+template< unsigned ScalarSize ,
+          unsigned Rank ,
+          unsigned s0  = 1 ,
+          unsigned s1  = 1 ,
+          unsigned s2  = 1 ,
+          unsigned s3  = 1 ,
+          unsigned s4  = 1 ,
+          unsigned s5  = 1 ,
+          unsigned s6  = 1 ,
+          unsigned s7  = 1 >
+struct Shape ;
+
+//----------------------------------------------------------------------------
+/** \brief  Shape equality if the value type, layout, and dimensions
+ *          are equal.
+ */
+template< unsigned xSize , unsigned xRank ,
+          unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
+          unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
+
+          unsigned ySize , unsigned yRank ,
+          unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
+          unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
+KOKKOS_INLINE_FUNCTION
+bool operator == ( const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
+                   const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
+{
+  enum { same_size = xSize == ySize };
+  enum { same_rank = xRank == yRank };
+
+  return same_size && same_rank &&
+         size_t( x.N0 )   == size_t( y.N0 ) &&
+         unsigned( x.N1 ) == unsigned( y.N1 ) &&
+         unsigned( x.N2 ) == unsigned( y.N2 ) &&
+         unsigned( x.N3 ) == unsigned( y.N3 ) &&
+         unsigned( x.N4 ) == unsigned( y.N4 ) &&
+         unsigned( x.N5 ) == unsigned( y.N5 ) &&
+         unsigned( x.N6 ) == unsigned( y.N6 ) &&
+         unsigned( x.N7 ) == unsigned( y.N7 ) ;
+}
+
+template< unsigned xSize , unsigned xRank ,
+          unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
+          unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
+
+          unsigned ySize ,unsigned yRank ,
+          unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
+          unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
+KOKKOS_INLINE_FUNCTION
+bool operator != ( const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
+                   const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
+{ return ! operator == ( x , y ); }
+
+//----------------------------------------------------------------------------
+
+void assert_counts_are_equal_throw(
+  const size_t x_count ,
+  const size_t y_count );
+
+inline
+void assert_counts_are_equal(
+  const size_t x_count ,
+  const size_t y_count )
+{
+  if ( x_count != y_count ) {
+    assert_counts_are_equal_throw( x_count , y_count );
+  }
+}
+
+void assert_shapes_are_equal_throw(
+  const unsigned x_scalar_size ,
+  const unsigned x_rank ,
+  const size_t   x_N0 , const unsigned x_N1 ,
+  const unsigned x_N2 , const unsigned x_N3 ,
+  const unsigned x_N4 , const unsigned x_N5 ,
+  const unsigned x_N6 , const unsigned x_N7 ,
+
+  const unsigned y_scalar_size ,
+  const unsigned y_rank ,
+  const size_t   y_N0 , const unsigned y_N1 ,
+  const unsigned y_N2 , const unsigned y_N3 ,
+  const unsigned y_N4 , const unsigned y_N5 ,
+  const unsigned y_N6 , const unsigned y_N7 );
+
+template< unsigned xSize , unsigned xRank ,
+          unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
+          unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
+
+          unsigned ySize , unsigned yRank ,
+          unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
+          unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
+inline
+void assert_shapes_are_equal(
+  const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
+  const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
+{
+  typedef Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> x_type ;
+  typedef Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> y_type ;
+
+  if ( x != y ) {
+    assert_shapes_are_equal_throw(
+      x_type::scalar_size, x_type::rank, x.N0, x.N1, x.N2, x.N3, x.N4, x.N5, x.N6, x.N7,
+      y_type::scalar_size, y_type::rank, y.N0, y.N1, y.N2, y.N3, y.N4, y.N5, y.N6, y.N7 );
+  }
+}
+
+template< unsigned xSize , unsigned xRank ,
+          unsigned xN0 , unsigned xN1 , unsigned xN2 , unsigned xN3 ,
+          unsigned xN4 , unsigned xN5 , unsigned xN6 , unsigned xN7 ,
+
+          unsigned ySize , unsigned yRank ,
+          unsigned yN0 , unsigned yN1 , unsigned yN2 , unsigned yN3 ,
+          unsigned yN4 , unsigned yN5 , unsigned yN6 , unsigned yN7 >
+void assert_shapes_equal_dimension(
+  const Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> & x ,
+  const Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> & y )
+{
+  typedef Shape<xSize,xRank,xN0,xN1,xN2,xN3,xN4,xN5,xN6,xN7> x_type ;
+  typedef Shape<ySize,yRank,yN0,yN1,yN2,yN3,yN4,yN5,yN6,yN7> y_type ;
+
+  // Omit comparison of scalar_size.
+  if ( unsigned( x.rank ) != unsigned( y.rank ) ||
+       size_t( x.N0 )   != size_t( y.N0 ) || 
+       unsigned( x.N1 ) != unsigned( y.N1 ) || 
+       unsigned( x.N2 ) != unsigned( y.N2 ) || 
+       unsigned( x.N3 ) != unsigned( y.N3 ) ||
+       unsigned( x.N4 ) != unsigned( y.N4 ) || 
+       unsigned( x.N5 ) != unsigned( y.N5 ) || 
+       unsigned( x.N6 ) != unsigned( y.N6 ) || 
+       unsigned( x.N7 ) != unsigned( y.N7 ) ) {
+    assert_shapes_are_equal_throw(
+      x_type::scalar_size, x_type::rank, x.N0, x.N1, x.N2, x.N3, x.N4, x.N5, x.N6, x.N7,
+      y_type::scalar_size, y_type::rank, y.N0, y.N1, y.N2, y.N3, y.N4, y.N5, y.N6, y.N7 );
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< class ShapeType > struct assert_shape_is_rank_zero ;
+template< class ShapeType > struct assert_shape_is_rank_one ;
+
+template< unsigned Size >
+struct assert_shape_is_rank_zero< Shape<Size,0> >
+  : public true_type {};
+
+template< unsigned Size , unsigned s0 >
+struct assert_shape_is_rank_one< Shape<Size,1,s0> >
+  : public true_type {};
+
+//----------------------------------------------------------------------------
+
+/** \brief  Array bounds assertion templated on the execution space
+ *          to allow device-specific abort code.
+ */
+template< class Space >
+struct AssertShapeBoundsAbort ;
+
+template<>
+struct AssertShapeBoundsAbort< Kokkos::HostSpace >
+{
+  static void apply( const size_t rank ,
+                     const size_t n0 , const size_t n1 ,
+                     const size_t n2 , const size_t n3 ,
+                     const size_t n4 , const size_t n5 ,
+                     const size_t n6 , const size_t n7 ,
+                     const size_t arg_rank ,
+                     const size_t i0 , const size_t i1 ,
+                     const size_t i2 , const size_t i3 ,
+                     const size_t i4 , const size_t i5 ,
+                     const size_t i6 , const size_t i7 );
+};
+
+template< class ExecutionSpace >
+struct AssertShapeBoundsAbort
+{
+  KOKKOS_INLINE_FUNCTION
+  static void apply( const size_t rank ,
+                     const size_t n0 , const size_t n1 ,
+                     const size_t n2 , const size_t n3 ,
+                     const size_t n4 , const size_t n5 ,
+                     const size_t n6 , const size_t n7 ,
+                     const size_t arg_rank ,
+                     const size_t i0 , const size_t i1 ,
+                     const size_t i2 , const size_t i3 ,
+                     const size_t i4 , const size_t i5 ,
+                     const size_t i6 , const size_t i7 )
+    {
+      AssertShapeBoundsAbort< Kokkos::HostSpace >
+        ::apply( rank ,    n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7 ,
+                 arg_rank, i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 );
+    }
+};
+
+template< class ShapeType >
+KOKKOS_INLINE_FUNCTION
+void assert_shape_bounds( const ShapeType & shape ,
+                          const size_t arg_rank ,
+                          const size_t i0 ,
+                          const size_t i1 = 0 ,
+                          const size_t i2 = 0 ,
+                          const size_t i3 = 0 ,
+                          const size_t i4 = 0 ,
+                          const size_t i5 = 0 ,
+                          const size_t i6 = 0 ,
+                          const size_t i7 = 0 )
+{
+  // Must supply at least as many indices as ranks.
+  // Every index must be within bounds.
+  const bool ok = ShapeType::rank <= arg_rank &&
+                  i0 < shape.N0 && 
+                  i1 < shape.N1 &&
+                  i2 < shape.N2 &&
+                  i3 < shape.N3 &&
+                  i4 < shape.N4 &&
+                  i5 < shape.N5 &&
+                  i6 < shape.N6 &&
+                  i7 < shape.N7 ;
+
+  if ( ! ok ) {
+    AssertShapeBoundsAbort< Kokkos::Impl::ActiveExecutionMemorySpace >
+      ::apply( ShapeType::rank ,
+               shape.N0 , shape.N1 , shape.N2 , shape.N3 ,
+               shape.N4 , shape.N5 , shape.N6 , shape.N7 ,
+               arg_rank , i0 , i1 , i2 , i3 , i4 , i5 , i6 , i7 );
+  }
+}
+
+#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK )
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) assert_shape_bounds(S,1,I0);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) assert_shape_bounds(S,2,I0,I1);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) assert_shape_bounds(S,3,I0,I1,I2);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) assert_shape_bounds(S,4,I0,I1,I2,I3);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) assert_shape_bounds(S,5,I0,I1,I2,I3,I4);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) assert_shape_bounds(S,6,I0,I1,I2,I3,I4,I5);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) assert_shape_bounds(S,7,I0,I1,I2,I3,I4,I5,I6);
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) assert_shape_bounds(S,8,I0,I1,I2,I3,I4,I5,I6,I7);
+#else
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_1( S , I0 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_2( S , I0 , I1 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_3( S , I0 , I1 , I2 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_4( S , I0 , I1 , I2 , I3 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_5( S , I0 , I1 , I2 , I3 , I4 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_6( S , I0 , I1 , I2 , I3 , I4 , I5 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_7( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 ) /* */
+#define KOKKOS_ASSERT_SHAPE_BOUNDS_8( S , I0 , I1 , I2 , I3 , I4 , I5 , I6 , I7 ) /* */
+#endif
+
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Specialization and optimization for the Rank 0 shape.
+
+template < unsigned ScalarSize >
+struct Shape< ScalarSize , 0, 1,1,1,1, 1,1,1,1 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 0 };
+  enum { rank         = 0 };
+
+  enum { N0 = 1 };
+  enum { N1 = 1 };
+  enum { N2 = 1 };
+  enum { N3 = 1 };
+  enum { N4 = 1 };
+  enum { N5 = 1 };
+  enum { N6 = 1 };
+  enum { N7 = 1 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & ,
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ,
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
+  {}
+};
+
+//----------------------------------------------------------------------------
+
+template< unsigned R > struct assign_shape_dimension ;
+
+#define KOKKOS_ASSIGN_SHAPE_DIMENSION( R ) \
+template<> \
+struct assign_shape_dimension< R > \
+{ \
+  template< class ShapeType > \
+  KOKKOS_INLINE_FUNCTION \
+  assign_shape_dimension( ShapeType & shape \
+                        , typename Impl::enable_if<( R < ShapeType::rank_dynamic ), size_t >::type n \
+                        ) { shape.N ## R = n ; } \
+};
+
+KOKKOS_ASSIGN_SHAPE_DIMENSION(0)
+KOKKOS_ASSIGN_SHAPE_DIMENSION(1)
+KOKKOS_ASSIGN_SHAPE_DIMENSION(2)
+KOKKOS_ASSIGN_SHAPE_DIMENSION(3)
+KOKKOS_ASSIGN_SHAPE_DIMENSION(4)
+KOKKOS_ASSIGN_SHAPE_DIMENSION(5)
+KOKKOS_ASSIGN_SHAPE_DIMENSION(6)
+KOKKOS_ASSIGN_SHAPE_DIMENSION(7)
+
+#undef KOKKOS_ASSIGN_SHAPE_DIMENSION
+
+//----------------------------------------------------------------------------
+// All-static dimension array
+
+template < unsigned ScalarSize ,
+           unsigned Rank ,
+           unsigned s0 ,
+           unsigned s1 ,
+           unsigned s2 ,
+           unsigned s3 ,
+           unsigned s4 ,
+           unsigned s5 ,
+           unsigned s6 ,
+           unsigned s7 >
+struct Shape {
+
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 0 };
+  enum { rank         = Rank };
+
+  enum { N0 = s0 };
+  enum { N1 = s1 };
+  enum { N2 = s2 };
+  enum { N3 = s3 };
+  enum { N4 = s4 };
+  enum { N5 = s5 };
+  enum { N6 = s6 };
+  enum { N7 = s7 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & ,
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ,
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
+  {}
+};
+
+// 1 == dynamic_rank <= rank <= 8
+template < unsigned ScalarSize ,
+           unsigned Rank ,
+           unsigned s1 ,
+           unsigned s2 ,
+           unsigned s3 ,
+           unsigned s4 ,
+           unsigned s5 ,
+           unsigned s6 ,
+           unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,s1,s2,s3, s4,s5,s6,s7 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 1 };
+  enum { rank         = Rank };
+
+  size_t N0 ; // For 1 == dynamic_rank allow  N0 > 2^32
+
+  enum { N1 = s1 };
+  enum { N2 = s2 };
+  enum { N3 = s3 };
+  enum { N4 = s4 };
+  enum { N5 = s5 };
+  enum { N6 = s6 };
+  enum { N7 = s7 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & s ,
+               size_t n0 , unsigned = 0 , unsigned = 0 , unsigned = 0 ,
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
+  { s.N0 = n0 ; }
+};
+
+// 2 == dynamic_rank <= rank <= 8
+template < unsigned ScalarSize , unsigned Rank ,
+           unsigned s2 ,
+           unsigned s3 ,
+           unsigned s4 ,
+           unsigned s5 ,
+           unsigned s6 ,
+           unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,0,s2,s3, s4,s5,s6,s7 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 2 };
+  enum { rank         = Rank };
+
+  unsigned N0 ;
+  unsigned N1 ;
+
+  enum { N2 = s2 };
+  enum { N3 = s3 };
+  enum { N4 = s4 };
+  enum { N5 = s5 };
+  enum { N6 = s6 };
+  enum { N7 = s7 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & s ,
+               unsigned n0 , unsigned n1 , unsigned = 0 , unsigned = 0 ,
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
+  { s.N0 = n0 ; s.N1 = n1 ; }
+};
+
+// 3 == dynamic_rank <= rank <= 8
+template < unsigned Rank , unsigned ScalarSize ,
+           unsigned s3 ,
+           unsigned s4 ,
+           unsigned s5 ,
+           unsigned s6 ,
+           unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,0,0,s3, s4,s5,s6,s7>
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 3 };
+  enum { rank         = Rank };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+
+  enum { N3 = s3 };
+  enum { N4 = s4 };
+  enum { N5 = s5 };
+  enum { N6 = s6 };
+  enum { N7 = s7 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & s ,
+               unsigned n0 , unsigned n1 , unsigned n2 , unsigned = 0 ,
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
+  { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; }
+};
+
+// 4 == dynamic_rank <= rank <= 8
+template < unsigned ScalarSize , unsigned Rank ,
+           unsigned s4 ,
+           unsigned s5 ,
+           unsigned s6 ,
+           unsigned s7 >
+struct Shape< ScalarSize , Rank, 0,0,0,0, s4,s5,s6,s7 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 4 };
+  enum { rank         = Rank };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+  unsigned N3 ;
+
+  enum { N4 = s4 };
+  enum { N5 = s5 };
+  enum { N6 = s6 };
+  enum { N7 = s7 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & s ,
+               unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
+               unsigned = 0 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
+  { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; }
+};
+
+// 5 == dynamic_rank <= rank <= 8
+template < unsigned ScalarSize , unsigned Rank ,
+           unsigned s5 ,
+           unsigned s6 ,
+           unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,0,0,0, 0,s5,s6,s7 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 5 };
+  enum { rank         = Rank };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+  unsigned N3 ;
+  unsigned N4 ;
+
+  enum { N5 = s5 };
+  enum { N6 = s6 };
+  enum { N7 = s7 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & s ,
+               unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
+               unsigned n4 , unsigned = 0 , unsigned = 0 , unsigned = 0 )
+  { s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ; s.N4 = n4 ; }
+};
+
+// 6 == dynamic_rank <= rank <= 8
+template < unsigned ScalarSize , unsigned Rank ,
+           unsigned s6 ,
+           unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,0,0,0, 0,0,s6,s7 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 6 };
+  enum { rank         = Rank };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+  unsigned N3 ;
+  unsigned N4 ;
+  unsigned N5 ;
+
+  enum { N6 = s6 };
+  enum { N7 = s7 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & s ,
+               unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
+               unsigned n4 , unsigned n5 = 0 , unsigned = 0 , unsigned = 0 )
+  {
+    s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ;
+    s.N4 = n4 ; s.N5 = n5 ;
+  }
+};
+
+// 7 == dynamic_rank <= rank <= 8
+template < unsigned ScalarSize , unsigned Rank ,
+           unsigned s7 >
+struct Shape< ScalarSize , Rank , 0,0,0,0, 0,0,0,s7 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 7 };
+  enum { rank         = Rank };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+  unsigned N3 ;
+  unsigned N4 ;
+  unsigned N5 ;
+  unsigned N6 ;
+
+  enum { N7 = s7 };
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & s ,
+               unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
+               unsigned n4 , unsigned n5 , unsigned n6 , unsigned = 0 )
+  {
+    s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ;
+    s.N4 = n4 ; s.N5 = n5 ; s.N6 = n6 ;
+  }
+};
+
+// 8 == dynamic_rank <= rank <= 8
+template < unsigned ScalarSize >
+struct Shape< ScalarSize , 8 , 0,0,0,0, 0,0,0,0 >
+{
+  enum { scalar_size   = ScalarSize };
+  enum { rank_dynamic = 8 };
+  enum { rank         = 8 };
+
+  unsigned N0 ;
+  unsigned N1 ;
+  unsigned N2 ;
+  unsigned N3 ;
+  unsigned N4 ;
+  unsigned N5 ;
+  unsigned N6 ;
+  unsigned N7 ;
+
+  KOKKOS_INLINE_FUNCTION
+  static
+  void assign( Shape & s ,
+               unsigned n0 , unsigned n1 , unsigned n2 , unsigned n3 ,
+               unsigned n4 , unsigned n5 , unsigned n6 , unsigned n7 )
+  {
+    s.N0 = n0 ; s.N1 = n1 ; s.N2 = n2 ; s.N3 = n3 ;
+    s.N4 = n4 ; s.N5 = n5 ; s.N6 = n6 ; s.N7 = n7 ;
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< class ShapeType , unsigned N ,
+          unsigned R = ShapeType::rank_dynamic >
+struct ShapeInsert ;
+
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 0 >
+{
+  typedef Shape< ShapeType::scalar_size ,
+                 ShapeType::rank + 1 ,
+                 N ,
+                 ShapeType::N0 ,
+                 ShapeType::N1 ,
+                 ShapeType::N2 ,
+                 ShapeType::N3 ,
+                 ShapeType::N4 ,
+                 ShapeType::N5 ,
+                 ShapeType::N6 > type ;
+};
+
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 1 >
+{
+  typedef Shape< ShapeType::scalar_size ,
+                 ShapeType::rank + 1 ,
+                 0 ,
+                 N ,
+                 ShapeType::N1 ,
+                 ShapeType::N2 ,
+                 ShapeType::N3 ,
+                 ShapeType::N4 ,
+                 ShapeType::N5 ,
+                 ShapeType::N6 > type ;
+};
+
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 2 >
+{
+  typedef Shape< ShapeType::scalar_size ,
+                 ShapeType::rank + 1 ,
+                 0 ,
+                 0 ,
+                 N ,
+                 ShapeType::N2 ,
+                 ShapeType::N3 ,
+                 ShapeType::N4 ,
+                 ShapeType::N5 ,
+                 ShapeType::N6 > type ;
+};
+
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 3 >
+{
+  typedef Shape< ShapeType::scalar_size ,
+                 ShapeType::rank + 1 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 N ,
+                 ShapeType::N3 ,
+                 ShapeType::N4 ,
+                 ShapeType::N5 ,
+                 ShapeType::N6 > type ;
+};
+
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 4 >
+{
+  typedef Shape< ShapeType::scalar_size ,
+                 ShapeType::rank + 1 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 N ,
+                 ShapeType::N4 ,
+                 ShapeType::N5 ,
+                 ShapeType::N6 > type ;
+};
+
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 5 >
+{
+  typedef Shape< ShapeType::scalar_size ,
+                 ShapeType::rank + 1 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 N ,
+                 ShapeType::N5 ,
+                 ShapeType::N6 > type ;
+};
+
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 6 >
+{
+  typedef Shape< ShapeType::scalar_size ,
+                 ShapeType::rank + 1 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 N ,
+                 ShapeType::N6 > type ;
+};
+
+template< class ShapeType , unsigned N >
+struct ShapeInsert< ShapeType , N , 7 >
+{
+  typedef Shape< ShapeType::scalar_size ,
+                 ShapeType::rank + 1 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 0 ,
+                 N > type ;
+};
+
+//----------------------------------------------------------------------------
+
+template< class DstShape , class SrcShape ,
+          unsigned DstRankDynamic   = DstShape::rank_dynamic ,
+          bool     DstRankDynamicOK = unsigned(DstShape::rank_dynamic) >= unsigned(SrcShape::rank_dynamic) >
+struct ShapeCompatible { enum { value = false }; };
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 8 , true >
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) };
+};
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 7 , true >
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
+                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
+};
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 6 , true >
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
+                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
+                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
+};
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 5 , true >
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
+                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
+                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
+                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
+};
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 4 , true >
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
+                 unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
+                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
+                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
+                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
+};
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 3 , true >
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
+                 unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
+                 unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
+                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
+                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
+                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
+};
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 2 , true >
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
+                 unsigned(DstShape::N2) == unsigned(SrcShape::N2) &&
+                 unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
+                 unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
+                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
+                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
+                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
+};
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 1 , true >
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
+                 unsigned(DstShape::N1) == unsigned(SrcShape::N1) &&
+                 unsigned(DstShape::N2) == unsigned(SrcShape::N2) &&
+                 unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
+                 unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
+                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
+                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
+                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
+};
+
+template< class DstShape , class SrcShape >
+struct ShapeCompatible< DstShape , SrcShape , 0 , true >
+{
+  enum { value = unsigned(DstShape::scalar_size) == unsigned(SrcShape::scalar_size) &&
+                 unsigned(DstShape::N0) == unsigned(SrcShape::N0) &&
+                 unsigned(DstShape::N1) == unsigned(SrcShape::N1) &&
+                 unsigned(DstShape::N2) == unsigned(SrcShape::N2) &&
+                 unsigned(DstShape::N3) == unsigned(SrcShape::N3) &&
+                 unsigned(DstShape::N4) == unsigned(SrcShape::N4) &&
+                 unsigned(DstShape::N5) == unsigned(SrcShape::N5) &&
+                 unsigned(DstShape::N6) == unsigned(SrcShape::N6) &&
+                 unsigned(DstShape::N7) == unsigned(SrcShape::N7) };
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< unsigned ScalarSize , unsigned Rank ,
+          unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 ,
+          unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 ,
+          typename iType >
+KOKKOS_INLINE_FUNCTION
+size_t dimension( 
+  const Shape<ScalarSize,Rank,s0,s1,s2,s3,s4,s5,s6,s7> & shape ,
+  const iType & r )
+{
+  return 0 == r ? shape.N0 : (
+         1 == r ? shape.N1 : (
+         2 == r ? shape.N2 : (
+         3 == r ? shape.N3 : (
+         4 == r ? shape.N4 : (
+         5 == r ? shape.N5 : (
+         6 == r ? shape.N6 : (
+         7 == r ? shape.N7 : 1 )))))));
+}
+
+template< unsigned ScalarSize , unsigned Rank ,
+          unsigned s0 , unsigned s1 , unsigned s2 , unsigned s3 ,
+          unsigned s4 , unsigned s5 , unsigned s6 , unsigned s7 >
+KOKKOS_INLINE_FUNCTION
+size_t cardinality_count(
+  const Shape<ScalarSize,Rank,s0,s1,s2,s3,s4,s5,s6,s7> & shape )
+{
+  return size_t(shape.N0) * shape.N1 * shape.N2 * shape.N3 *
+         shape.N4 * shape.N5 * shape.N6 * shape.N7 ;
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_CORESHAPE_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Singleton.hpp b/lib/kokkos/core/src/impl/Kokkos_Singleton.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..86bc94ab0be9e8cfd00ea5a95cebc906bd3aa312
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Singleton.hpp
@@ -0,0 +1,55 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_SINGLETON_HPP
+#define KOKKOS_SINGLETON_HPP
+
+#include <Kokkos_Macros.hpp>
+#include <cstddef>
+
+namespace Kokkos { namespace Impl {
+
+
+}} // namespace Kokkos::Impl
+
+#endif // KOKKOS_SINGLETON_HPP
diff --git a/lib/kokkos/core/src/impl/Kokkos_StaticAssert.hpp b/lib/kokkos/core/src/impl/Kokkos_StaticAssert.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..25e2ec9dc1849db862d9cb0d01bfd817c584b3b8
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_StaticAssert.hpp
@@ -0,0 +1,79 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_STATICASSERT_HPP
+#define KOKKOS_STATICASSERT_HPP
+
+namespace Kokkos {
+namespace Impl {
+
+template < bool , class T = void >
+struct StaticAssert ;
+
+template< class T >
+struct StaticAssert< true , T > {
+  typedef T type ;
+  static const bool value = true ;
+};
+
+template < class A , class B >
+struct StaticAssertSame ;
+
+template < class A >
+struct StaticAssertSame<A,A> { typedef A type ; };
+
+template < class A , class B >
+struct StaticAssertAssignable ;
+
+template < class A >
+struct StaticAssertAssignable<A,A> { typedef A type ; };
+
+template < class A >
+struct StaticAssertAssignable< const A , A > { typedef const A type ; };
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* KOKKOS_STATICASSERT_HPP */
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Tags.hpp b/lib/kokkos/core/src/impl/Kokkos_Tags.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..4885d37376e029e11aa8a67dd8ce8ef8f5c2ba7e
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Tags.hpp
@@ -0,0 +1,156 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_TAGS_HPP
+#define KOKKOS_TAGS_HPP
+
+#include <impl/Kokkos_Traits.hpp>
+#include <Kokkos_Core_fwd.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+//----------------------------------------------------------------------------
+
+template<class ExecutionSpace, class MemorySpace>
+struct Device {
+  typedef ExecutionSpace execution_space;
+  typedef MemorySpace memory_space;
+  typedef Device<execution_space,memory_space> device_type;
+};
+}
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class C , class Enable = void >
+struct is_memory_space : public bool_< false > {};
+
+template< class C , class Enable = void >
+struct is_execution_space : public bool_< false > {};
+
+template< class C , class Enable = void >
+struct is_execution_policy : public bool_< false > {};
+
+template< class C , class Enable = void >
+struct is_array_layout : public Impl::false_type {};
+
+template< class C , class Enable = void >
+struct is_memory_traits : public Impl::false_type {};
+
+
+template< class C >
+struct is_memory_space< C , typename Impl::enable_if_type< typename C::memory_space >::type >
+  : public bool_< Impl::is_same< C , typename C::memory_space >::value > {};
+
+template< class C >
+struct is_execution_space< C , typename Impl::enable_if_type< typename C::execution_space >::type >
+  : public bool_< Impl::is_same< C , typename C::execution_space >::value > {};
+
+template< class C >
+struct is_execution_policy< C , typename Impl::enable_if_type< typename C::execution_policy >::type >
+  : public bool_< Impl::is_same< C , typename C::execution_policy >::value > {};
+
+template< class C >
+struct is_array_layout< C , typename Impl::enable_if_type< typename C::array_layout >::type >
+  : public bool_< Impl::is_same< C , typename C::array_layout >::value > {};
+
+template< class C >
+struct is_memory_traits< C , typename Impl::enable_if_type< typename C::memory_traits >::type >
+  : public bool_< Impl::is_same< C , typename C::memory_traits >::value > {};
+
+
+//----------------------------------------------------------------------------
+
+template< class C , class Enable = void >
+struct is_space : public Impl::false_type {};
+
+template< class C >
+struct is_space< C
+                 , typename Impl::enable_if<(
+                     Impl::is_same< C , typename C::execution_space >::value ||
+                     Impl::is_same< C , typename C::memory_space    >::value ||
+                     Impl::is_same< C , Device<
+                                             typename C::execution_space,
+                                             typename C::memory_space> >::value
+                   )>::type
+                 >
+  : public Impl::true_type
+{
+  typedef typename C::execution_space  execution_space ;
+  typedef typename C::memory_space     memory_space ;
+
+  // The host_memory_space defines a space with host-resident memory.
+  // If the execution space's memory space is host accessible then use that execution space.
+  // else use the HostSpace.
+  typedef
+      typename Impl::if_c< Impl::is_same< memory_space , HostSpace >::value
+#ifdef KOKKOS_HAVE_CUDA
+                        || Impl::is_same< memory_space , CudaUVMSpace>::value
+                        || Impl::is_same< memory_space , CudaHostPinnedSpace>::value
+#endif
+                          , memory_space , HostSpace >::type
+      host_memory_space ;
+
+  // The host_execution_space defines a space which has access to HostSpace.
+  // If the execution space can access HostSpace then use that execution space.
+  // else use the DefaultHostExecutionSpace.
+#ifdef KOKKOS_HAVE_CUDA
+  typedef
+      typename Impl::if_c< Impl::is_same< execution_space , Cuda >::value
+                          , DefaultHostExecutionSpace , execution_space >::type
+      host_execution_space ;
+#else
+  typedef execution_space host_execution_space;
+#endif
+
+  typedef Device<host_execution_space,host_memory_space> host_mirror_space;
+};
+}
+}
+
+#endif
diff --git a/lib/kokkos/core/src/impl/Kokkos_Timer.hpp b/lib/kokkos/core/src/impl/Kokkos_Timer.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..80a326f0802d36e6092d96d0608c13353cc50ddb
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Timer.hpp
@@ -0,0 +1,115 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_IMPLWALLTIME_HPP
+#define KOKKOS_IMPLWALLTIME_HPP
+
+#include <stddef.h>
+
+#ifdef _MSC_VER
+#undef KOKKOS_USE_LIBRT
+#include <gettimeofday.c>
+#else
+#ifdef KOKKOS_USE_LIBRT
+#include <ctime>
+#else
+#include <sys/time.h>
+#endif
+#endif
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Time since construction */
+
+class Timer {
+private:
+  #ifdef KOKKOS_USE_LIBRT
+	struct timespec m_old;
+  #else
+	struct timeval m_old ;
+  #endif
+  Timer( const Timer & );
+  Timer & operator = ( const Timer & );
+public:
+
+  inline
+  void reset() {
+    #ifdef KOKKOS_USE_LIBRT
+	  clock_gettime(CLOCK_REALTIME, &m_old);
+    #else
+	  gettimeofday( & m_old , ((struct timezone *) NULL ) );
+    #endif
+  }
+
+  inline
+  ~Timer() {}
+
+  inline
+  Timer() { reset(); }
+
+  inline
+  double seconds() const
+  {
+    #ifdef KOKKOS_USE_LIBRT
+      struct timespec m_new;
+      clock_gettime(CLOCK_REALTIME, &m_new);
+
+      return ( (double) ( m_new.tv_sec  - m_old.tv_sec ) ) +
+             ( (double) ( m_new.tv_nsec - m_old.tv_nsec ) * 1.0e-9 );
+    #else
+      struct timeval m_new ;
+
+      ::gettimeofday( & m_new , ((struct timezone *) NULL ) );
+
+      return ( (double) ( m_new.tv_sec  - m_old.tv_sec ) ) +
+             ( (double) ( m_new.tv_usec - m_old.tv_usec ) * 1.0e-6 );
+    #endif
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+#endif /* #ifndef KOKKOS_IMPLWALLTIME_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Traits.hpp b/lib/kokkos/core/src/impl/Kokkos_Traits.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..52358842f54f3dd3ce6f19e971a7c71d02488499
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Traits.hpp
@@ -0,0 +1,370 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOSTRAITS_HPP
+#define KOKKOSTRAITS_HPP
+
+#include <stddef.h>
+#include <stdint.h>
+#include <Kokkos_Macros.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+/* C++11 conformal compile-time type traits utilities.
+ * Prefer to use C++11 when portably available.
+ */
+//----------------------------------------------------------------------------
+// C++11 Helpers:
+
+template < class T , T v >
+struct integral_constant
+{
+  // Declaration of 'static const' causes an unresolved linker symbol in debug
+  // static const T value = v ;
+  enum { value = T(v) };
+  typedef T value_type;
+  typedef integral_constant<T,v> type;
+  KOKKOS_INLINE_FUNCTION operator T() { return v ; }
+};
+
+typedef integral_constant<bool,false> false_type ;
+typedef integral_constant<bool,true>  true_type ;
+
+//----------------------------------------------------------------------------
+// C++11 Type relationships:
+
+template< class X , class Y > struct is_same : public false_type {};
+template< class X >           struct is_same<X,X> : public true_type {};
+
+//----------------------------------------------------------------------------
+// C++11 Type properties:
+
+template <typename T> struct is_const : public false_type {};
+template <typename T> struct is_const<const T> : public true_type {};
+template <typename T> struct is_const<const T & > : public true_type {};
+
+template <typename T> struct is_array : public false_type {};
+template <typename T> struct is_array< T[] > : public true_type {};
+template <typename T, unsigned N > struct is_array< T[N] > : public true_type {};
+
+//----------------------------------------------------------------------------
+// C++11 Type transformations:
+
+template <typename T> struct remove_const { typedef T type; };
+template <typename T> struct remove_const<const T> { typedef T type; };
+template <typename T> struct remove_const<const T & > { typedef T & type; };
+
+template <typename T> struct add_const { typedef const T type; };
+template <typename T> struct add_const<T & > { typedef const T & type; };
+template <typename T> struct add_const<const T> { typedef const T type; };
+template <typename T> struct add_const<const T & > { typedef const T & type; };
+
+template <typename T> struct remove_reference { typedef T type ; };
+template <typename T> struct remove_reference< T & > { typedef T type ; };
+template <typename T> struct remove_reference< const T & > { typedef const T type ; };
+
+template <typename T> struct remove_extent { typedef T type ; };
+template <typename T> struct remove_extent<T[]> { typedef T type ; };
+template <typename T, unsigned N > struct remove_extent<T[N]> { typedef T type ; };
+
+//----------------------------------------------------------------------------
+// C++11 Other type generators:
+
+template< bool , class T , class F >
+struct condition { typedef F type ; };
+
+template< class T , class F >
+struct condition<true,T,F> { typedef T type ; };
+
+template< bool , class = void >
+struct enable_if ;
+
+template< class T >
+struct enable_if< true , T > { typedef T type ; };
+
+//----------------------------------------------------------------------------
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+// Other traits
+
+namespace Kokkos {
+namespace Impl {
+
+//----------------------------------------------------------------------------
+
+template< class , class T = void >
+struct enable_if_type { typedef T type ; };
+
+//----------------------------------------------------------------------------
+
+template< bool B >
+struct bool_ : public integral_constant<bool,B> {};
+
+template< unsigned I >
+struct unsigned_ : public integral_constant<unsigned,I> {};
+
+template< int I >
+struct int_ : public integral_constant<int,I> {};
+
+typedef bool_<true> true_;
+typedef bool_<false> false_;
+//----------------------------------------------------------------------------
+// if_
+
+template < bool Cond , typename TrueType , typename FalseType>
+struct if_c
+{
+  enum { value = Cond };
+
+  typedef FalseType type;
+
+
+  typedef typename remove_const<
+          typename remove_reference<type>::type >::type value_type ;
+
+  typedef typename add_const<value_type>::type const_value_type ;
+
+  static KOKKOS_INLINE_FUNCTION
+  const_value_type & select( const_value_type & v ) { return v ; }
+
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( value_type & v ) { return v ; }
+
+  template< class T >
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( const T & ) { value_type * ptr(0); return *ptr ; }
+
+
+  template< class T >
+  static KOKKOS_INLINE_FUNCTION
+  const_value_type & select( const T & , const_value_type & v ) { return v ; }
+
+  template< class T >
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( const T & , value_type & v ) { return v ; }
+};
+
+template <typename TrueType, typename FalseType>
+struct if_c< true , TrueType , FalseType >
+{
+  enum { value = true };
+
+  typedef TrueType type;
+
+
+  typedef typename remove_const<
+          typename remove_reference<type>::type >::type value_type ;
+
+  typedef typename add_const<value_type>::type const_value_type ;
+
+  static KOKKOS_INLINE_FUNCTION
+  const_value_type & select( const_value_type & v ) { return v ; }
+
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( value_type & v ) { return v ; }
+
+  template< class T >
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( const T & ) { value_type * ptr(0); return *ptr ; }
+
+
+  template< class F >
+  static KOKKOS_INLINE_FUNCTION
+  const_value_type & select( const_value_type & v , const F & ) { return v ; }
+
+  template< class F >
+  static KOKKOS_INLINE_FUNCTION
+  value_type & select( value_type & v , const F & ) { return v ; }
+};
+
+template< typename TrueType >
+struct if_c< false , TrueType , void >
+{
+  enum { value = false };
+
+  typedef void type ;
+  typedef void value_type ;
+};
+
+template< typename FalseType >
+struct if_c< true , void , FalseType >
+{
+  enum { value = true };
+
+  typedef void type ;
+  typedef void value_type ;
+};
+
+template <typename Cond, typename TrueType, typename FalseType>
+struct if_ : public if_c<Cond::value, TrueType, FalseType> {};
+
+//----------------------------------------------------------------------------
+
+// Allows aliased types:
+template< typename T >
+struct is_integral : public integral_constant< bool ,
+  (
+    Impl::is_same< T ,          char >::value ||
+    Impl::is_same< T , unsigned char >::value ||
+    Impl::is_same< T ,          short int >::value ||
+    Impl::is_same< T , unsigned short int >::value ||
+    Impl::is_same< T ,          int >::value ||
+    Impl::is_same< T , unsigned int >::value ||
+    Impl::is_same< T ,          long int >::value ||
+    Impl::is_same< T , unsigned long int >::value ||
+    Impl::is_same< T ,          long long int >::value ||
+    Impl::is_same< T , unsigned long long int >::value ||
+
+    Impl::is_same< T , int8_t   >::value ||
+    Impl::is_same< T , int16_t  >::value ||
+    Impl::is_same< T , int32_t  >::value ||
+    Impl::is_same< T , int64_t  >::value ||
+    Impl::is_same< T , uint8_t  >::value ||
+    Impl::is_same< T , uint16_t >::value ||
+    Impl::is_same< T , uint32_t >::value ||
+    Impl::is_same< T , uint64_t >::value 
+  )>
+{};
+
+//----------------------------------------------------------------------------
+
+
+template < size_t N >
+struct is_power_of_two
+{
+  enum type { value = (N > 0) && !(N & (N-1)) };
+};
+
+template < size_t N , bool OK = is_power_of_two<N>::value >
+struct power_of_two ;
+
+template < size_t N >
+struct power_of_two<N,true>
+{
+  enum type { value = 1+ power_of_two<(N>>1),true>::value };
+};
+
+template <>
+struct power_of_two<2,true>
+{
+  enum type { value = 1 };
+};
+
+template <>
+struct power_of_two<1,true>
+{
+  enum type { value = 0 };
+};
+
+/** \brief  If power of two then return power,
+ *          otherwise return ~0u.
+ */
+static KOKKOS_FORCEINLINE_FUNCTION
+unsigned power_of_two_if_valid( const unsigned N )
+{
+  unsigned p = ~0u ;
+  if ( N && ! ( N & ( N - 1 ) ) ) {
+#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_HAVE_CUDA )
+    p = __ffs(N) - 1 ;
+#elif defined( __GNUC__ ) || defined( __GNUG__ )
+    p = __builtin_ffs(N) - 1 ;
+#elif defined( __INTEL_COMPILER )
+    p = _bit_scan_forward(N);
+#else
+    p = 0 ;
+    for ( unsigned j = 1 ; ! ( N & j ) ; j <<= 1 ) { ++p ; }
+#endif
+  }
+  return p ;
+}
+
+//----------------------------------------------------------------------------
+
+template< typename T , T v , bool NonZero = ( v != T(0) ) >
+struct integral_nonzero_constant
+{
+  // Declaration of 'static const' causes an unresolved linker symbol in debug
+  // static const T value = v ;
+  enum { value = T(v) };
+  typedef T value_type ;
+  typedef integral_nonzero_constant<T,v> type ;
+  KOKKOS_INLINE_FUNCTION integral_nonzero_constant( const T & ) {}
+};
+
+template< typename T , T zero >
+struct integral_nonzero_constant<T,zero,false>
+{
+  const T value ;
+  typedef T value_type ;
+  typedef integral_nonzero_constant<T,0> type ;
+  KOKKOS_INLINE_FUNCTION integral_nonzero_constant( const T & v ) : value(v) {}
+};
+
+//----------------------------------------------------------------------------
+
+template < class C > struct is_integral_constant : public false_
+{
+  typedef void integral_type ;
+  enum { integral_value = 0 };
+};
+
+template < typename T , T v >
+struct is_integral_constant< integral_constant<T,v> > : public true_
+{
+  typedef T integral_type ;
+  enum { integral_value = v };
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOSTRAITS_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewDefault.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewDefault.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..8334af3a3c88285e4121e71d0c8164a8ad277b17
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewDefault.hpp
@@ -0,0 +1,878 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VIEWDEFAULT_HPP
+#define KOKKOS_VIEWDEFAULT_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template<>
+struct ViewAssignment< ViewDefault , ViewDefault , void >
+{
+  typedef ViewDefault Specialize ;
+
+  //------------------------------------
+  /** \brief  Compatible value and shape and LayoutLeft/Right to LayoutStride*/
+
+  template< class DT , class DL , class DD , class DM ,
+            class ST , class SL , class SD , class SM >
+  KOKKOS_INLINE_FUNCTION
+  ViewAssignment(       View<DT,DL,DD,DM,Specialize> & dst ,
+                  const View<ST,SL,SD,SM,Specialize> & src ,
+                  const typename enable_if<(
+                    ViewAssignable< ViewTraits<DT,DL,DD,DM> ,
+                                    ViewTraits<ST,SL,SD,SM> >::value
+                    ||
+                    ( ViewAssignable< ViewTraits<DT,DL,DD,DM> ,
+                                      ViewTraits<ST,SL,SD,SM> >::assignable_value
+                      &&
+                      ShapeCompatible< typename ViewTraits<DT,DL,DD,DM>::shape_type ,
+                                       typename ViewTraits<ST,SL,SD,SM>::shape_type >::value
+                      &&
+                      is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout,LayoutStride>::value
+                      && (is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout,LayoutLeft>::value ||
+                          is_same< typename ViewTraits<ST,SL,SD,SM>::array_layout,LayoutRight>::value))
+                  )>::type * = 0 )
+  {
+    dst.m_offset_map.assign( src.m_offset_map );
+
+    dst.m_management = src.m_management ;
+
+    dst.m_ptr_on_device = ViewDataManagement< ViewTraits<DT,DL,DD,DM> >::create_handle( src.m_ptr_on_device, src.m_tracker );
+
+    dst.m_tracker = src.m_tracker ;
+
+  }
+
+
+  /** \brief  Assign 1D Strided View to LayoutLeft or LayoutRight if stride[0]==1 */
+
+  template< class DT , class DL , class DD , class DM ,
+            class ST , class SD , class SM >
+  KOKKOS_INLINE_FUNCTION
+  ViewAssignment(       View<DT,DL,DD,DM,Specialize> & dst ,
+                  const View<ST,LayoutStride,SD,SM,Specialize> & src ,
+                  const typename enable_if<(
+                    (
+                      ViewAssignable< ViewTraits<DT,DL,DD,DM> ,
+                                    ViewTraits<ST,LayoutStride,SD,SM> >::value
+                      ||
+                      ( ViewAssignable< ViewTraits<DT,DL,DD,DM> ,
+                                      ViewTraits<ST,LayoutStride,SD,SM> >::assignable_value
+                        &&
+                        ShapeCompatible< typename ViewTraits<DT,DL,DD,DM>::shape_type ,
+                                       typename ViewTraits<ST,LayoutStride,SD,SM>::shape_type >::value
+                      )
+                     )
+                     &&
+                      (View<DT,DL,DD,DM,Specialize>::rank==1)
+                     && (is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout,LayoutLeft>::value ||
+                          is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout,LayoutRight>::value)
+                  )>::type * = 0 )
+  {
+    size_t strides[8];
+    src.stride(strides);
+    if(strides[0]!=1) {
+      abort("Trying to assign strided 1D View to LayoutRight or LayoutLeft which is not stride-1");
+    }
+    dst.m_offset_map.assign( src.dimension_0(), 0, 0, 0, 0, 0, 0, 0, 0 );
+
+    dst.m_management = src.m_management ;
+
+    dst.m_ptr_on_device = ViewDataManagement< ViewTraits<DT,DL,DD,DM> >::create_handle( src.m_ptr_on_device, src.m_tracker );
+
+    dst.m_tracker = src.m_tracker ;
+
+  }
+
+  //------------------------------------
+  /** \brief  Deep copy data from compatible value type, layout, rank, and specialization.
+   *          Check the dimensions and allocation lengths at runtime.
+   */
+  template< class DT , class DL , class DD , class DM ,
+            class ST , class SL , class SD , class SM >
+  inline static
+  void deep_copy( const View<DT,DL,DD,DM,Specialize> & dst ,
+                  const View<ST,SL,SD,SM,Specialize> & src ,
+                  const typename Impl::enable_if<(
+                    Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::value_type ,
+                                   typename ViewTraits<ST,SL,SD,SM>::non_const_value_type >::value
+                    &&
+                    Impl::is_same< typename ViewTraits<DT,DL,DD,DM>::array_layout ,
+                                   typename ViewTraits<ST,SL,SD,SM>::array_layout >::value
+                    &&
+                    ( unsigned(ViewTraits<DT,DL,DD,DM>::rank) == unsigned(ViewTraits<ST,SL,SD,SM>::rank) )
+                  )>::type * = 0 )
+  {
+    typedef typename ViewTraits<DT,DL,DD,DM>::memory_space dst_memory_space ;
+    typedef typename ViewTraits<ST,SL,SD,SM>::memory_space src_memory_space ;
+
+    if ( dst.ptr_on_device() != src.ptr_on_device() ) {
+
+      Impl::assert_shapes_are_equal( dst.m_offset_map , src.m_offset_map );
+
+      const size_t nbytes = dst.m_offset_map.scalar_size * dst.m_offset_map.capacity();
+
+      DeepCopy< dst_memory_space , src_memory_space >( dst.ptr_on_device() , src.ptr_on_device() , nbytes );
+    }
+  }
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class ExecSpace , class DT , class DL, class DD, class DM, class DS >
+struct ViewDefaultConstruct< ExecSpace , Kokkos::View<DT,DL,DD,DM,DS> , true >
+{
+  Kokkos::View<DT,DL,DD,DM,DS> * const m_ptr ;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( const typename ExecSpace::size_type& i ) const
+    { new(m_ptr+i) Kokkos::View<DT,DL,DD,DM,DS>(); }
+
+  ViewDefaultConstruct( Kokkos::View<DT,DL,DD,DM,DS> * pointer , size_t capacity )
+    : m_ptr( pointer )
+    {
+      Kokkos::RangePolicy< ExecSpace > range( 0 , capacity );
+      parallel_for( range , *this );
+      ExecSpace::fence();
+    }
+};
+
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+        , class SubArg4_type , class SubArg5_type , class SubArg6_type , class SubArg7_type
+        >
+struct ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault >
+                  , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type
+                  , SubArg4_type , SubArg5_type , SubArg6_type , SubArg7_type >
+{
+private:
+
+  typedef View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault >  SrcViewType ;
+
+  enum { V0 = Impl::is_same< SubArg0_type , void >::value ? 1 : 0 };
+  enum { V1 = Impl::is_same< SubArg1_type , void >::value ? 1 : 0 };
+  enum { V2 = Impl::is_same< SubArg2_type , void >::value ? 1 : 0 };
+  enum { V3 = Impl::is_same< SubArg3_type , void >::value ? 1 : 0 };
+  enum { V4 = Impl::is_same< SubArg4_type , void >::value ? 1 : 0 };
+  enum { V5 = Impl::is_same< SubArg5_type , void >::value ? 1 : 0 };
+  enum { V6 = Impl::is_same< SubArg6_type , void >::value ? 1 : 0 };
+  enum { V7 = Impl::is_same< SubArg7_type , void >::value ? 1 : 0 };
+
+  // The source view rank must be equal to the input argument rank
+  // Once a void argument is encountered all subsequent arguments must be void.
+  enum { InputRank =
+    Impl::StaticAssert<( SrcViewType::rank ==
+                         ( V0 ? 0 : (
+                           V1 ? 1 : (
+                           V2 ? 2 : (
+                           V3 ? 3 : (
+                           V4 ? 4 : (
+                           V5 ? 5 : (
+                           V6 ? 6 : (
+                           V7 ? 7 : 8 ))))))) ))
+                       &&
+                       ( SrcViewType::rank ==
+                         ( 8 - ( V0 + V1 + V2 + V3 + V4 + V5 + V6 + V7 ) ) )
+    >::value ? SrcViewType::rank : 0 };
+
+  enum { R0 = Impl::ViewOffsetRange< SubArg0_type >::is_range ? 1 : 0 };
+  enum { R1 = Impl::ViewOffsetRange< SubArg1_type >::is_range ? 1 : 0 };
+  enum { R2 = Impl::ViewOffsetRange< SubArg2_type >::is_range ? 1 : 0 };
+  enum { R3 = Impl::ViewOffsetRange< SubArg3_type >::is_range ? 1 : 0 };
+  enum { R4 = Impl::ViewOffsetRange< SubArg4_type >::is_range ? 1 : 0 };
+  enum { R5 = Impl::ViewOffsetRange< SubArg5_type >::is_range ? 1 : 0 };
+  enum { R6 = Impl::ViewOffsetRange< SubArg6_type >::is_range ? 1 : 0 };
+  enum { R7 = Impl::ViewOffsetRange< SubArg7_type >::is_range ? 1 : 0 };
+
+  enum { OutputRank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3)
+                    + unsigned(R4) + unsigned(R5) + unsigned(R6) + unsigned(R7) };
+
+  // Reverse
+  enum { R0_rev = 0 == InputRank ? 0u : (
+                  1 == InputRank ? unsigned(R0) : (
+                  2 == InputRank ? unsigned(R1) : (
+                  3 == InputRank ? unsigned(R2) : (
+                  4 == InputRank ? unsigned(R3) : (
+                  5 == InputRank ? unsigned(R4) : (
+                  6 == InputRank ? unsigned(R5) : (
+                  7 == InputRank ? unsigned(R6) : unsigned(R7) ))))))) };
+
+  typedef typename SrcViewType::array_layout  SrcViewLayout ;
+
+  // Choose array layout, attempting to preserve original layout if at all possible.
+  typedef typename Impl::if_c<
+     ( // Same Layout IF
+       // OutputRank 0
+       ( OutputRank == 0 )
+       ||
+       // OutputRank 1 or 2, InputLayout Left, Interval 0
+       // because single stride one or second index has a stride.
+       ( OutputRank <= 2 && R0 && Impl::is_same<SrcViewLayout,LayoutLeft>::value )
+       ||
+       // OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1]
+       // because single stride one or second index has a stride.
+       ( OutputRank <= 2 && R0_rev && Impl::is_same<SrcViewLayout,LayoutRight>::value )
+     ), SrcViewLayout , Kokkos::LayoutStride >::type OutputViewLayout ;
+
+  // Choose data type as a purely dynamic rank array to accomodate a runtime range.
+  typedef typename Impl::if_c< OutputRank == 0 , typename SrcViewType::value_type ,
+          typename Impl::if_c< OutputRank == 1 , typename SrcViewType::value_type *,
+          typename Impl::if_c< OutputRank == 2 , typename SrcViewType::value_type **,
+          typename Impl::if_c< OutputRank == 3 , typename SrcViewType::value_type ***,
+          typename Impl::if_c< OutputRank == 4 , typename SrcViewType::value_type ****,
+          typename Impl::if_c< OutputRank == 5 , typename SrcViewType::value_type *****,
+          typename Impl::if_c< OutputRank == 6 , typename SrcViewType::value_type ******,
+          typename Impl::if_c< OutputRank == 7 , typename SrcViewType::value_type *******,
+                                                 typename SrcViewType::value_type ********
+  >::type >::type >::type >::type >::type >::type >::type >::type  OutputData ;
+
+  // Choose space.
+  // If the source view's template arg1 or arg2 is a space then use it,
+  // otherwise use the source view's execution space.
+
+  typedef typename Impl::if_c< Impl::is_space< SrcArg1Type >::value , SrcArg1Type ,
+          typename Impl::if_c< Impl::is_space< SrcArg2Type >::value , SrcArg2Type , typename SrcViewType::device_type
+  >::type >::type OutputSpace ;
+
+public:
+
+  // If keeping the layout then match non-data type arguments
+  // else keep execution space and memory traits.
+  typedef typename
+    Impl::if_c< Impl::is_same< SrcViewLayout , OutputViewLayout >::value
+              , Kokkos::View< OutputData , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault >
+              , Kokkos::View< OutputData , OutputViewLayout , OutputSpace
+                            , typename SrcViewType::memory_traits
+                            , Impl::ViewDefault >
+              >::type  type ;
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+// Construct subview of a Rank 8 view
+template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type >
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+        , class SubArg4_type , class SubArg5_type , class SubArg6_type , class SubArg7_type
+        >
+KOKKOS_INLINE_FUNCTION
+View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >::
+View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src
+    , const SubArg0_type & arg0
+    , const SubArg1_type & arg1
+    , const SubArg2_type & arg2
+    , const SubArg3_type & arg3
+    , const SubArg4_type & arg4
+    , const SubArg5_type & arg5
+    , const SubArg6_type & arg6
+    , const SubArg7_type & arg7
+    )
+  : m_ptr_on_device( (typename traits::value_type*) NULL)
+  , m_offset_map()
+  , m_management()
+  , m_tracker()
+{
+  // This constructor can only be used to construct a subview
+  // from the source view.  This type must match the subview type
+  // deduced from the source view and subview arguments.
+
+  typedef Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault >
+                           , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type
+                           , SubArg4_type , SubArg5_type , SubArg6_type , SubArg7_type >
+    ViewSubviewDeduction ;
+
+  enum { is_a_valid_subview_constructor =
+    Impl::StaticAssert<
+      Impl::is_same< View , typename ViewSubviewDeduction::type >::value
+    >::value
+  };
+
+  if ( is_a_valid_subview_constructor ) {
+
+    typedef Impl::ViewOffsetRange< SubArg0_type > R0 ;
+    typedef Impl::ViewOffsetRange< SubArg1_type > R1 ;
+    typedef Impl::ViewOffsetRange< SubArg2_type > R2 ;
+    typedef Impl::ViewOffsetRange< SubArg3_type > R3 ;
+    typedef Impl::ViewOffsetRange< SubArg4_type > R4 ;
+    typedef Impl::ViewOffsetRange< SubArg5_type > R5 ;
+    typedef Impl::ViewOffsetRange< SubArg6_type > R6 ;
+    typedef Impl::ViewOffsetRange< SubArg7_type > R7 ;
+
+    // 'assign_subview' returns whether the subview offset_map
+    // introduces noncontiguity in the view.
+    const bool introduce_noncontiguity =
+      m_offset_map.assign_subview( src.m_offset_map
+                                 , R0::dimension( src.m_offset_map.N0 , arg0 )
+                                 , R1::dimension( src.m_offset_map.N1 , arg1 )
+                                 , R2::dimension( src.m_offset_map.N2 , arg2 )
+                                 , R3::dimension( src.m_offset_map.N3 , arg3 )
+                                 , R4::dimension( src.m_offset_map.N4 , arg4 )
+                                 , R5::dimension( src.m_offset_map.N5 , arg5 )
+                                 , R6::dimension( src.m_offset_map.N6 , arg6 )
+                                 , R7::dimension( src.m_offset_map.N7 , arg7 )
+                                 );
+
+    if ( m_offset_map.capacity() ) {
+
+      m_management = src.m_management ;
+
+      if ( introduce_noncontiguity ) m_management.set_noncontiguous();
+
+      m_ptr_on_device = src.m_ptr_on_device +
+                        src.m_offset_map( R0::begin( arg0 )
+                                        , R1::begin( arg1 )
+                                        , R2::begin( arg2 )
+                                        , R3::begin( arg3 )
+                                        , R4::begin( arg4 )
+                                        , R5::begin( arg5 )
+                                        , R6::begin( arg6 )
+                                        , R7::begin( arg7 ) );
+      m_tracker = src.m_tracker ;
+    }
+  }
+}
+
+// Construct subview of a Rank 7 view
+template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type >
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+        , class SubArg4_type , class SubArg5_type , class SubArg6_type
+        >
+KOKKOS_INLINE_FUNCTION
+View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >::
+View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src
+    , const SubArg0_type & arg0
+    , const SubArg1_type & arg1
+    , const SubArg2_type & arg2
+    , const SubArg3_type & arg3
+    , const SubArg4_type & arg4
+    , const SubArg5_type & arg5
+    , const SubArg6_type & arg6
+    )
+  : m_ptr_on_device( (typename traits::value_type*) NULL)
+  , m_offset_map()
+  , m_management()
+  , m_tracker()
+{
+  // This constructor can only be used to construct a subview
+  // from the source view.  This type must match the subview type
+  // deduced from the source view and subview arguments.
+
+  typedef Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault >
+                           , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type
+                           , SubArg4_type , SubArg5_type , SubArg6_type , void >
+    ViewSubviewDeduction ;
+
+  enum { is_a_valid_subview_constructor =
+    Impl::StaticAssert<
+      Impl::is_same< View , typename ViewSubviewDeduction::type >::value
+    >::value
+  };
+
+  if ( is_a_valid_subview_constructor ) {
+
+    typedef Impl::ViewOffsetRange< SubArg0_type > R0 ;
+    typedef Impl::ViewOffsetRange< SubArg1_type > R1 ;
+    typedef Impl::ViewOffsetRange< SubArg2_type > R2 ;
+    typedef Impl::ViewOffsetRange< SubArg3_type > R3 ;
+    typedef Impl::ViewOffsetRange< SubArg4_type > R4 ;
+    typedef Impl::ViewOffsetRange< SubArg5_type > R5 ;
+    typedef Impl::ViewOffsetRange< SubArg6_type > R6 ;
+
+    // 'assign_subview' returns whether the subview offset_map
+    // introduces noncontiguity in the view.
+    const bool introduce_noncontiguity =
+      m_offset_map.assign_subview( src.m_offset_map
+                                 , R0::dimension( src.m_offset_map.N0 , arg0 )
+                                 , R1::dimension( src.m_offset_map.N1 , arg1 )
+                                 , R2::dimension( src.m_offset_map.N2 , arg2 )
+                                 , R3::dimension( src.m_offset_map.N3 , arg3 )
+                                 , R4::dimension( src.m_offset_map.N4 , arg4 )
+                                 , R5::dimension( src.m_offset_map.N5 , arg5 )
+                                 , R6::dimension( src.m_offset_map.N6 , arg6 )
+                                 , 0
+                                 );
+
+    if ( m_offset_map.capacity() ) {
+
+      m_management = src.m_management ;
+
+      if ( introduce_noncontiguity ) m_management.set_noncontiguous();
+
+      m_ptr_on_device = src.m_ptr_on_device +
+                        src.m_offset_map( R0::begin( arg0 )
+                                        , R1::begin( arg1 )
+                                        , R2::begin( arg2 )
+                                        , R3::begin( arg3 )
+                                        , R4::begin( arg4 )
+                                        , R5::begin( arg5 )
+                                        , R6::begin( arg6 )
+                                        );
+      m_tracker = src.m_tracker ;
+    }
+  }
+}
+
+// Construct subview of a Rank 6 view
+template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type >
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+        , class SubArg4_type , class SubArg5_type
+        >
+KOKKOS_INLINE_FUNCTION
+View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >::
+View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src
+    , const SubArg0_type & arg0
+    , const SubArg1_type & arg1
+    , const SubArg2_type & arg2
+    , const SubArg3_type & arg3
+    , const SubArg4_type & arg4
+    , const SubArg5_type & arg5
+    )
+  : m_ptr_on_device( (typename traits::value_type*) NULL)
+  , m_offset_map()
+  , m_management()
+  , m_tracker()
+{
+  // This constructor can only be used to construct a subview
+  // from the source view.  This type must match the subview type
+  // deduced from the source view and subview arguments.
+
+  typedef Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault >
+                           , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type
+                           , SubArg4_type , SubArg5_type , void , void >
+    ViewSubviewDeduction ;
+
+  enum { is_a_valid_subview_constructor =
+    Impl::StaticAssert<
+      Impl::is_same< View , typename ViewSubviewDeduction::type >::value
+    >::value
+  };
+
+  if ( is_a_valid_subview_constructor ) {
+
+    typedef Impl::ViewOffsetRange< SubArg0_type > R0 ;
+    typedef Impl::ViewOffsetRange< SubArg1_type > R1 ;
+    typedef Impl::ViewOffsetRange< SubArg2_type > R2 ;
+    typedef Impl::ViewOffsetRange< SubArg3_type > R3 ;
+    typedef Impl::ViewOffsetRange< SubArg4_type > R4 ;
+    typedef Impl::ViewOffsetRange< SubArg5_type > R5 ;
+
+    // 'assign_subview' returns whether the subview offset_map
+    // introduces noncontiguity in the view.
+    const bool introduce_noncontiguity =
+      m_offset_map.assign_subview( src.m_offset_map
+                                 , R0::dimension( src.m_offset_map.N0 , arg0 )
+                                 , R1::dimension( src.m_offset_map.N1 , arg1 )
+                                 , R2::dimension( src.m_offset_map.N2 , arg2 )
+                                 , R3::dimension( src.m_offset_map.N3 , arg3 )
+                                 , R4::dimension( src.m_offset_map.N4 , arg4 )
+                                 , R5::dimension( src.m_offset_map.N5 , arg5 )
+                                 , 0
+                                 , 0
+                                 );
+
+    if ( m_offset_map.capacity() ) {
+
+      m_management = src.m_management ;
+
+      if ( introduce_noncontiguity ) m_management.set_noncontiguous();
+
+      m_ptr_on_device = src.m_ptr_on_device +
+                        src.m_offset_map( R0::begin( arg0 )
+                                        , R1::begin( arg1 )
+                                        , R2::begin( arg2 )
+                                        , R3::begin( arg3 )
+                                        , R4::begin( arg4 )
+                                        , R5::begin( arg5 )
+                                        );
+      m_tracker = src.m_tracker ;
+    }
+  }
+}
+
+// Construct subview of a Rank 5 view
+template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type >
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+        , class SubArg4_type
+        >
+KOKKOS_INLINE_FUNCTION
+View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >::
+View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src
+    , const SubArg0_type & arg0
+    , const SubArg1_type & arg1
+    , const SubArg2_type & arg2
+    , const SubArg3_type & arg3
+    , const SubArg4_type & arg4
+    )
+  : m_ptr_on_device( (typename traits::value_type*) NULL)
+  , m_offset_map()
+  , m_management()
+  , m_tracker()
+{
+  // This constructor can only be used to construct a subview
+  // from the source view.  This type must match the subview type
+  // deduced from the source view and subview arguments.
+
+  typedef Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault >
+                           , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type
+                           , SubArg4_type , void , void , void >
+    ViewSubviewDeduction ;
+
+  enum { is_a_valid_subview_constructor =
+    Impl::StaticAssert<
+      Impl::is_same< View , typename ViewSubviewDeduction::type >::value
+    >::value
+  };
+
+  if ( is_a_valid_subview_constructor ) {
+
+    typedef Impl::ViewOffsetRange< SubArg0_type > R0 ;
+    typedef Impl::ViewOffsetRange< SubArg1_type > R1 ;
+    typedef Impl::ViewOffsetRange< SubArg2_type > R2 ;
+    typedef Impl::ViewOffsetRange< SubArg3_type > R3 ;
+    typedef Impl::ViewOffsetRange< SubArg4_type > R4 ;
+
+    // 'assign_subview' returns whether the subview offset_map
+    // introduces noncontiguity in the view.
+    const bool introduce_noncontiguity =
+      m_offset_map.assign_subview( src.m_offset_map
+                                 , R0::dimension( src.m_offset_map.N0 , arg0 )
+                                 , R1::dimension( src.m_offset_map.N1 , arg1 )
+                                 , R2::dimension( src.m_offset_map.N2 , arg2 )
+                                 , R3::dimension( src.m_offset_map.N3 , arg3 )
+                                 , R4::dimension( src.m_offset_map.N4 , arg4 )
+                                 , 0
+                                 , 0
+                                 , 0
+                                 );
+
+    if ( m_offset_map.capacity() ) {
+
+      m_management = src.m_management ;
+
+      if ( introduce_noncontiguity ) m_management.set_noncontiguous();
+
+      m_ptr_on_device = src.m_ptr_on_device +
+                        src.m_offset_map( R0::begin( arg0 )
+                                        , R1::begin( arg1 )
+                                        , R2::begin( arg2 )
+                                        , R3::begin( arg3 )
+                                        , R4::begin( arg4 )
+                                        );
+      m_tracker = src.m_tracker ;
+    }
+  }
+}
+
+// Construct subview of a Rank 4 view
+template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type >
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type , class SubArg2_type , class SubArg3_type
+        >
+KOKKOS_INLINE_FUNCTION
+View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >::
+View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src
+    , const SubArg0_type & arg0
+    , const SubArg1_type & arg1
+    , const SubArg2_type & arg2
+    , const SubArg3_type & arg3
+    )
+  : m_ptr_on_device( (typename traits::value_type*) NULL)
+  , m_offset_map()
+  , m_management()
+  , m_tracker()
+{
+  // This constructor can only be used to construct a subview
+  // from the source view.  This type must match the subview type
+  // deduced from the source view and subview arguments.
+
+  typedef Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault >
+                           , SubArg0_type , SubArg1_type , SubArg2_type , SubArg3_type
+                           , void , void , void , void >
+    ViewSubviewDeduction ;
+
+  enum { is_a_valid_subview_constructor =
+    Impl::StaticAssert<
+      Impl::is_same< View , typename ViewSubviewDeduction::type >::value
+    >::value
+  };
+
+  if ( is_a_valid_subview_constructor ) {
+
+    typedef Impl::ViewOffsetRange< SubArg0_type > R0 ;
+    typedef Impl::ViewOffsetRange< SubArg1_type > R1 ;
+    typedef Impl::ViewOffsetRange< SubArg2_type > R2 ;
+    typedef Impl::ViewOffsetRange< SubArg3_type > R3 ;
+
+    // 'assign_subview' returns whether the subview offset_map
+    // introduces noncontiguity in the view.
+    const bool introduce_noncontiguity =
+      m_offset_map.assign_subview( src.m_offset_map
+                                 , R0::dimension( src.m_offset_map.N0 , arg0 )
+                                 , R1::dimension( src.m_offset_map.N1 , arg1 )
+                                 , R2::dimension( src.m_offset_map.N2 , arg2 )
+                                 , R3::dimension( src.m_offset_map.N3 , arg3 )
+                                 , 0
+                                 , 0
+                                 , 0
+                                 , 0
+                                 );
+
+    if ( m_offset_map.capacity() ) {
+
+      m_management = src.m_management ;
+
+      if ( introduce_noncontiguity ) m_management.set_noncontiguous();
+
+      m_ptr_on_device = src.m_ptr_on_device +
+                        src.m_offset_map( R0::begin( arg0 )
+                                        , R1::begin( arg1 )
+                                        , R2::begin( arg2 )
+                                        , R3::begin( arg3 )
+                                        );
+      m_tracker = src.m_tracker ;
+    }
+  }
+}
+
+// Construct subview of a Rank 3 view
+template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type >
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type , class SubArg2_type
+        >
+KOKKOS_INLINE_FUNCTION
+View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >::
+View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src
+    , const SubArg0_type & arg0
+    , const SubArg1_type & arg1
+    , const SubArg2_type & arg2
+    )
+  : m_ptr_on_device( (typename traits::value_type*) NULL)
+  , m_offset_map()
+  , m_management()
+  , m_tracker()
+{
+  // This constructor can only be used to construct a subview
+  // from the source view.  This type must match the subview type
+  // deduced from the source view and subview arguments.
+
+  typedef Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault >
+                           , SubArg0_type , SubArg1_type , SubArg2_type , void , void , void , void , void >
+    ViewSubviewDeduction ;
+
+  enum { is_a_valid_subview_constructor =
+    Impl::StaticAssert<
+      Impl::is_same< View , typename ViewSubviewDeduction::type >::value
+    >::value
+  };
+
+  if ( is_a_valid_subview_constructor ) {
+
+    typedef Impl::ViewOffsetRange< SubArg0_type > R0 ;
+    typedef Impl::ViewOffsetRange< SubArg1_type > R1 ;
+    typedef Impl::ViewOffsetRange< SubArg2_type > R2 ;
+
+    // 'assign_subview' returns whether the subview offset_map
+    // introduces noncontiguity in the view.
+    const bool introduce_noncontiguity =
+      m_offset_map.assign_subview( src.m_offset_map
+                                 , R0::dimension( src.m_offset_map.N0 , arg0 )
+                                 , R1::dimension( src.m_offset_map.N1 , arg1 )
+                                 , R2::dimension( src.m_offset_map.N2 , arg2 )
+                                 , 0 , 0 , 0 , 0 , 0);
+
+    if ( m_offset_map.capacity() ) {
+
+      m_management = src.m_management ;
+
+      if ( introduce_noncontiguity ) m_management.set_noncontiguous();
+
+      m_ptr_on_device = src.m_ptr_on_device +
+                        src.m_offset_map( R0::begin( arg0 )
+                                        , R1::begin( arg1 )
+                                        , R2::begin( arg2 )
+                                        );
+      m_tracker = src.m_tracker ;
+    }
+  }
+}
+
+// Construct subview of a Rank 2 view
+template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type >
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type , class SubArg1_type
+        >
+KOKKOS_INLINE_FUNCTION
+View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >::
+View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src
+    , const SubArg0_type & arg0
+    , const SubArg1_type & arg1
+    )
+  : m_ptr_on_device( (typename traits::value_type*) NULL)
+  , m_offset_map()
+  , m_management()
+  , m_tracker()
+{
+  // This constructor can only be used to construct a subview
+  // from the source view.  This type must match the subview type
+  // deduced from the source view and subview arguments.
+
+  typedef Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault >
+                           , SubArg0_type , SubArg1_type , void , void , void , void , void , void >
+    ViewSubviewDeduction ;
+
+  enum { is_a_valid_subview_constructor =
+    Impl::StaticAssert<
+      Impl::is_same< View , typename ViewSubviewDeduction::type >::value
+    >::value
+  };
+
+  if ( is_a_valid_subview_constructor ) {
+
+    typedef Impl::ViewOffsetRange< SubArg0_type > R0 ;
+    typedef Impl::ViewOffsetRange< SubArg1_type > R1 ;
+
+    // 'assign_subview' returns whether the subview offset_map
+    // introduces noncontiguity in the view.
+    const bool introduce_noncontiguity =
+      m_offset_map.assign_subview( src.m_offset_map
+                                 , R0::dimension( src.m_offset_map.N0 , arg0 )
+                                 , R1::dimension( src.m_offset_map.N1 , arg1 )
+                                 , 0 , 0 , 0 , 0 , 0 , 0 );
+
+    if ( m_offset_map.capacity() ) {
+
+      m_management = src.m_management ;
+
+      if ( introduce_noncontiguity ) m_management.set_noncontiguous();
+
+      m_ptr_on_device = src.m_ptr_on_device +
+                        src.m_offset_map( R0::begin( arg0 )
+                                        , R1::begin( arg1 )
+                                        );
+      m_tracker = src.m_tracker ;
+    }
+  }
+}
+
+// Construct subview of a Rank 1 view
+template< class DstDataType , class DstArg1Type , class DstArg2Type , class DstArg3Type >
+template< class SrcDataType , class SrcArg1Type , class SrcArg2Type , class SrcArg3Type
+        , class SubArg0_type
+        >
+KOKKOS_INLINE_FUNCTION
+View< DstDataType , DstArg1Type , DstArg2Type , DstArg3Type , Impl::ViewDefault >::
+View( const View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault > & src
+    , const SubArg0_type & arg0
+    )
+  : m_ptr_on_device( (typename traits::value_type*) NULL)
+  , m_offset_map()
+  , m_management()
+  , m_tracker()
+{
+  // This constructor can only be used to construct a subview
+  // from the source view.  This type must match the subview type
+  // deduced from the source view and subview arguments.
+
+  typedef Impl::ViewSubview< View< SrcDataType , SrcArg1Type , SrcArg2Type , SrcArg3Type , Impl::ViewDefault >
+                           , SubArg0_type , void , void , void , void , void , void , void >
+    ViewSubviewDeduction ;
+
+  enum { is_a_valid_subview_constructor =
+    Impl::StaticAssert<
+      Impl::is_same< View , typename ViewSubviewDeduction::type >::value
+    >::value
+  };
+
+  if ( is_a_valid_subview_constructor ) {
+
+    typedef Impl::ViewOffsetRange< SubArg0_type > R0 ;
+
+    // 'assign_subview' returns whether the subview offset_map
+    // introduces noncontiguity in the view.
+    const bool introduce_noncontiguity =
+      m_offset_map.assign_subview( src.m_offset_map
+                                 , R0::dimension( src.m_offset_map.N0 , arg0 )
+                                 , 0 , 0 , 0 , 0 , 0 , 0 , 0 );
+
+    if ( m_offset_map.capacity() ) {
+
+      m_management = src.m_management ;
+
+      if ( introduce_noncontiguity ) m_management.set_noncontiguous();
+
+      m_ptr_on_device = src.m_ptr_on_device +
+                        src.m_offset_map( R0::begin( arg0 )
+                                        );
+      m_tracker = src.m_tracker ;
+    }
+  }
+}
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_VIEWDEFAULT_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewOffset.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewOffset.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..61cd75844fff32d6189784af773d008c58d1ce4a
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewOffset.hpp
@@ -0,0 +1,1348 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VIEWOFFSET_HPP
+#define KOKKOS_VIEWOFFSET_HPP
+
+#include <Kokkos_Pair.hpp>
+#include <Kokkos_Layout.hpp>
+#include <impl/Kokkos_Traits.hpp>
+#include <impl/Kokkos_Shape.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+struct ALL ;
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos { namespace Impl {
+
+template < class ShapeType , class LayoutType , typename Enable = void >
+struct ViewOffset ;
+
+//----------------------------------------------------------------------------
+// LayoutLeft AND ( 1 >= rank OR 0 == rank_dynamic ) : no padding / striding
+template < class ShapeType >
+struct ViewOffset< ShapeType , LayoutLeft
+                 , typename enable_if<( 1 >= ShapeType::rank
+                                        ||
+                                        0 == ShapeType::rank_dynamic
+                                      )>::type >
+  : public ShapeType
+{
+  typedef size_t     size_type ;
+  typedef ShapeType  shape_type ;
+  typedef LayoutLeft array_layout ;
+
+  enum { has_padding = false };
+
+  template< unsigned R >
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n )
+    { assign_shape_dimension<R>( *this , n ); }
+
+  // Return whether the subview introduced noncontiguity
+  template< class S , class L >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::enable_if<( 0 == shape_type::rank &&
+                             Impl::is_same<L,LayoutLeft>::value
+                           ), bool >::type
+  assign_subview( const ViewOffset<S,L,void> &
+                , const size_t n0
+                , const size_t n1
+                , const size_t n2
+                , const size_t n3
+                , const size_t n4
+                , const size_t n5
+                , const size_t n6
+                , const size_t n7
+                )
+    {
+      return false ; // did not introduce noncontiguity
+    }
+
+  // This subview must be 1 == rank and 1 == rank_dynamic.
+  // The source dimension #0 must be non-zero and all other dimensions are zero.
+  // Return whether the subview introduced noncontiguity
+  template< class S , class L >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::enable_if<( 1 == shape_type::rank &&
+                             1 == shape_type::rank_dynamic &&
+                             1 <= S::rank &&
+                             Impl::is_same<L,LayoutLeft>::value
+                           ), bool >::type
+  assign_subview( const ViewOffset<S,L,void> &
+                , const size_t n0
+                , const size_t n1
+                , const size_t n2
+                , const size_t n3
+                , const size_t n4
+                , const size_t n5
+                , const size_t n6
+                , const size_t n7
+                )
+    {
+      // n1 .. n7 must be zero
+      shape_type::N0 = n0 ;
+      return false ; // did not introduce noncontiguity
+    }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n0 , size_t n1 , size_t n2 , size_t n3
+             , size_t n4 , size_t n5 , size_t n6 , size_t n7
+             , size_t = 0 )
+    { shape_type::assign( *this , n0, n1, n2, n3, n4, n5, n6, n7 ); }
+
+  template< class ShapeRHS >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset< ShapeRHS , LayoutLeft > & rhs
+             , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic)
+                                  )>::type * = 0 )
+    { shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); }
+
+  template< class ShapeRHS >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset< ShapeRHS , LayoutRight > & rhs
+             , typename enable_if<( 1 == int(ShapeRHS::rank)
+                                    &&
+                                    1 == int(shape_type::rank)
+                                    &&
+                                    1 == int(shape_type::rank_dynamic)
+                                  )>::type * = 0 )
+    { shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); }
+
+  KOKKOS_INLINE_FUNCTION
+  void set_padding() {}
+
+  KOKKOS_INLINE_FUNCTION
+  size_type cardinality() const
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type capacity() const
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  // Stride with [ rank ] value is the total length
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      s[0] = 1 ;
+      if ( 0 < shape_type::rank ) { s[1] = shape_type::N0 ; }
+      if ( 1 < shape_type::rank ) { s[2] = s[1] * shape_type::N1 ; }
+      if ( 2 < shape_type::rank ) { s[3] = s[2] * shape_type::N2 ; }
+      if ( 3 < shape_type::rank ) { s[4] = s[3] * shape_type::N3 ; }
+      if ( 4 < shape_type::rank ) { s[5] = s[4] * shape_type::N4 ; }
+      if ( 5 < shape_type::rank ) { s[6] = s[5] * shape_type::N5 ; }
+      if ( 6 < shape_type::rank ) { s[7] = s[6] * shape_type::N6 ; }
+      if ( 7 < shape_type::rank ) { s[8] = s[7] * shape_type::N7 ; }
+    }
+
+  KOKKOS_INLINE_FUNCTION size_type stride_0() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION size_type stride_1() const { return shape_type::N0 ; }
+  KOKKOS_INLINE_FUNCTION size_type stride_2() const { return shape_type::N0 * shape_type::N1 ; }
+  KOKKOS_INLINE_FUNCTION size_type stride_3() const { return shape_type::N0 * shape_type::N1 * shape_type::N2 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_4() const
+    { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_5() const
+    { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_6() const
+    { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_7() const
+    { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 ; }
+
+  // rank 1
+  template< typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const & i0 ) const { return i0 ; }
+
+  // rank 2
+  template < typename I0 , typename I1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const & i0 , I1 const & i1 ) const
+    { return i0 + shape_type::N0 * i1 ; }
+
+  //rank 3
+  template <typename I0, typename I1, typename I2>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0
+                      , I1 const& i1
+                      , I2 const& i2
+                      ) const
+    {
+      return i0 + shape_type::N0 * (
+             i1 + shape_type::N1 * i2 );
+    }
+
+  //rank 4
+  template <typename I0, typename I1, typename I2, typename I3>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3 ) const
+    {
+      return i0 + shape_type::N0 * (
+             i1 + shape_type::N1 * (
+             i2 + shape_type::N2 * i3 ));
+    }
+
+  //rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4 ) const
+    {
+      return i0 + shape_type::N0 * (
+             i1 + shape_type::N1 * (
+             i2 + shape_type::N2 * (
+             i3 + shape_type::N3 * i4 )));
+    }
+
+  //rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5 ) const
+    {
+      return i0 + shape_type::N0 * (
+             i1 + shape_type::N1 * (
+             i2 + shape_type::N2 * (
+             i3 + shape_type::N3 * (
+             i4 + shape_type::N4 * i5 ))));
+    }
+
+  //rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6) const
+  {
+    return i0 + shape_type::N0 * (
+           i1 + shape_type::N1 * (
+           i2 + shape_type::N2 * (
+           i3 + shape_type::N3 * (
+           i4 + shape_type::N4 * (
+           i5 + shape_type::N5 * i6 )))));
+  }
+
+  //rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7) const
+  {
+    return i0 + shape_type::N0 * (
+           i1 + shape_type::N1 * (
+           i2 + shape_type::N2 * (
+           i3 + shape_type::N3 * (
+           i4 + shape_type::N4 * (
+           i5 + shape_type::N5 * (
+           i6 + shape_type::N6 * i7 ))))));
+  }
+};
+
+//----------------------------------------------------------------------------
+// LayoutLeft AND ( 1 < rank AND 0 < rank_dynamic ) : has padding / striding
+template < class ShapeType >
+struct ViewOffset< ShapeType , LayoutLeft
+                 , typename enable_if<( 1 < ShapeType::rank
+                                        &&
+                                        0 < ShapeType::rank_dynamic
+                                      )>::type >
+  : public ShapeType
+{
+  typedef size_t     size_type ;
+  typedef ShapeType  shape_type ;
+  typedef LayoutLeft array_layout ;
+
+  enum { has_padding = true };
+
+  size_type S0 ;
+
+  // This subview must be 2 == rank and 2 == rank_dynamic
+  // due to only having stride #0.
+  // The source dimension #0 must be non-zero for stride-one leading dimension.
+  // At most subsequent dimension can be non-zero.
+  // Return whether the subview introduced noncontiguity.
+  template< class S , class L >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::enable_if<( 2 == shape_type::rank &&
+                             2 == shape_type::rank_dynamic &&
+                             2 <= S::rank &&
+                             Impl::is_same<L,LayoutLeft>::value
+                           ), bool >::type
+  assign_subview( const ViewOffset<S,L,void> & rhs
+                , const size_t n0
+                , const size_t n1
+                , const size_t n2
+                , const size_t n3
+                , const size_t n4
+                , const size_t n5
+                , const size_t n6
+                , const size_t n7
+                )
+    {
+      // N1 = second non-zero dimension
+      // S0 = stride for second non-zero dimension
+      shape_type::N0 = n0 ;
+      shape_type::N1 = 0 ;
+      S0 = 0 ;
+
+      if      (                n1 ) { shape_type::N1 = n1 ; S0 = rhs.stride_1(); }
+      else if ( 2 < S::rank && n2 ) { shape_type::N1 = n2 ; S0 = rhs.stride_2(); }
+      else if ( 3 < S::rank && n3 ) { shape_type::N1 = n3 ; S0 = rhs.stride_3(); }
+      else if ( 4 < S::rank && n4 ) { shape_type::N1 = n4 ; S0 = rhs.stride_4(); }
+      else if ( 5 < S::rank && n5 ) { shape_type::N1 = n5 ; S0 = rhs.stride_5(); }
+      else if ( 6 < S::rank && n6 ) { shape_type::N1 = n6 ; S0 = rhs.stride_6(); }
+      else if ( 7 < S::rank && n7 ) { shape_type::N1 = n7 ; S0 = rhs.stride_7(); }
+
+      // Introduce noncontiguity if change the first dimension
+      // or took a range of a dimension after the second.
+      return ( size_t(shape_type::N0) != size_t(rhs.N0) ) || ( 0 == n1 );
+    }
+
+
+  template< unsigned R >
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n )
+    { assign_shape_dimension<R>( *this , n ); }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n0 , size_t n1 , size_t n2 , size_t n3
+             , size_t n4 , size_t n5 , size_t n6 , size_t n7
+             , size_t = 0 )
+    { shape_type::assign( *this , n0, n1, n2, n3, n4, n5, n6, n7 ); S0 = shape_type::N0 ; }
+
+  template< class ShapeRHS >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset< ShapeRHS , LayoutLeft > & rhs
+             , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) == 0
+                                  )>::type * = 0 )
+    {
+      shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 );
+      S0 = shape_type::N0 ; // No padding when dynamic_rank == 0
+    }
+
+  template< class ShapeRHS >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset< ShapeRHS , LayoutLeft > & rhs
+             , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) > 0
+                                  )>::type * = 0 )
+    {
+      shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 );
+      S0 = rhs.S0 ; // possibly padding when dynamic rank > 0
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void set_padding()
+    {
+      enum { div   = MEMORY_ALIGNMENT / shape_type::scalar_size };
+      enum { mod   = MEMORY_ALIGNMENT % shape_type::scalar_size };
+      enum { align = 0 == mod ? div : 0 };
+
+      if ( align && MEMORY_ALIGNMENT_THRESHOLD * align < S0 ) {
+
+        const size_type count_mod = S0 % ( div ? div : 1 );
+
+        if ( count_mod ) { S0 += align - count_mod ; }
+      }
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type cardinality() const
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type capacity() const
+    { return size_type(S0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  // Stride with [ rank ] as total length
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      s[0] = 1 ;
+      if ( 0 < shape_type::rank ) { s[1] = S0 ; }
+      if ( 1 < shape_type::rank ) { s[2] = s[1] * shape_type::N1 ; }
+      if ( 2 < shape_type::rank ) { s[3] = s[2] * shape_type::N2 ; }
+      if ( 3 < shape_type::rank ) { s[4] = s[3] * shape_type::N3 ; }
+      if ( 4 < shape_type::rank ) { s[5] = s[4] * shape_type::N4 ; }
+      if ( 5 < shape_type::rank ) { s[6] = s[5] * shape_type::N5 ; }
+      if ( 6 < shape_type::rank ) { s[7] = s[6] * shape_type::N6 ; }
+      if ( 7 < shape_type::rank ) { s[8] = s[7] * shape_type::N6 ; }
+    }
+
+  KOKKOS_INLINE_FUNCTION size_type stride_0() const { return 1 ; }
+  KOKKOS_INLINE_FUNCTION size_type stride_1() const { return S0 ; }
+  KOKKOS_INLINE_FUNCTION size_type stride_2() const { return S0 * shape_type::N1 ; }
+  KOKKOS_INLINE_FUNCTION size_type stride_3() const { return S0 * shape_type::N1 * shape_type::N2 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_4() const
+    { return S0 * shape_type::N1 * shape_type::N2 * shape_type::N3 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_5() const
+    { return S0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_6() const
+    { return S0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_7() const
+    { return S0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 ; }
+
+  // rank 2
+  template < typename I0 , typename I1 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const & i0 , I1 const & i1) const
+    { return i0 + S0 * i1 ; }
+
+  //rank 3
+  template <typename I0, typename I1, typename I2>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 ) const
+    {
+      return i0 + S0 * (
+             i1 + shape_type::N1 * i2 );
+    }
+
+  //rank 4
+  template <typename I0, typename I1, typename I2, typename I3>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3 ) const
+    {
+      return i0 + S0 * (
+             i1 + shape_type::N1 * (
+             i2 + shape_type::N2 * i3 ));
+    }
+
+  //rank 5
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4 ) const
+    {
+      return i0 + S0 * (
+             i1 + shape_type::N1 * (
+             i2 + shape_type::N2 * (
+             i3 + shape_type::N3 * i4 )));
+    }
+
+  //rank 6
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5 ) const
+    {
+      return i0 + S0 * (
+             i1 + shape_type::N1 * (
+             i2 + shape_type::N2 * (
+             i3 + shape_type::N3 * (
+             i4 + shape_type::N4 * i5 ))));
+    }
+
+  //rank 7
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6 ) const
+  {
+    return i0 + S0 * (
+           i1 + shape_type::N1 * (
+           i2 + shape_type::N2 * (
+           i3 + shape_type::N3 * (
+           i4 + shape_type::N4 * (
+           i5 + shape_type::N5 * i6 )))));
+  }
+
+  //rank 8
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2, I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7 ) const
+  {
+    return i0 + S0 * (
+           i1 + shape_type::N1 * (
+           i2 + shape_type::N2 * (
+           i3 + shape_type::N3 * (
+           i4 + shape_type::N4 * (
+           i5 + shape_type::N5 * (
+           i6 + shape_type::N6 * i7 ))))));
+  }
+};
+
+//----------------------------------------------------------------------------
+// LayoutRight AND ( 1 >= rank OR 1 >= rank_dynamic ) : no padding / striding
+template < class ShapeType >
+struct ViewOffset< ShapeType , LayoutRight
+                 , typename enable_if<( 1 >= ShapeType::rank
+                                        ||
+                                        1 >= ShapeType::rank_dynamic
+                                      )>::type >
+  : public ShapeType
+{
+  typedef size_t       size_type;
+  typedef ShapeType    shape_type;
+  typedef LayoutRight  array_layout ;
+
+  enum { has_padding = false };
+
+  // This subview must be 1 == rank and 1 == rank_dynamic
+  // The source view's last dimension must be non-zero
+  // Return whether the subview introduced noncontiguity
+  template< class S , class L >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::enable_if<( 0 == shape_type::rank &&
+                             Impl::is_same<L,LayoutRight>::value
+                           ), bool >::type
+  assign_subview( const ViewOffset<S,L,void> &
+                , const size_t n0
+                , const size_t n1
+                , const size_t n2
+                , const size_t n3
+                , const size_t n4
+                , const size_t n5
+                , const size_t n6
+                , const size_t n7
+                )
+    { return false ; }
+
+  // This subview must be 1 == rank and 1 == rank_dynamic
+  // The source view's last dimension must be non-zero
+  // Return whether the subview introduced noncontiguity
+  template< class S , class L >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::enable_if<( 1 == shape_type::rank &&
+                             1 == shape_type::rank_dynamic &&
+                             1 <= S::rank &&
+                             Impl::is_same<L,LayoutRight>::value
+                           ), bool >::type
+  assign_subview( const ViewOffset<S,L,void> &
+                , const size_t n0
+                , const size_t n1
+                , const size_t n2
+                , const size_t n3
+                , const size_t n4
+                , const size_t n5
+                , const size_t n6
+                , const size_t n7
+                )
+    {
+      shape_type::N0 = S::rank == 1 ? n0 : (
+                       S::rank == 2 ? n1 : (
+                       S::rank == 3 ? n2 : (
+                       S::rank == 4 ? n3 : (
+                       S::rank == 5 ? n4 : (
+                       S::rank == 6 ? n5 : (
+                       S::rank == 7 ? n6 : n7 ))))));
+      // should have n0 .. n_(rank-2) equal zero
+      return false ;
+    }
+
+  template< unsigned R >
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n )
+    { assign_shape_dimension<R>( *this , n ); }
+
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n0 , size_t n1 , size_t n2 , size_t n3
+             , size_t n4 , size_t n5 , size_t n6 , size_t n7
+             , size_t = 0 )
+    { shape_type::assign( *this , n0, n1, n2, n3, n4, n5, n6, n7 ); }
+
+  template< class ShapeRHS >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset< ShapeRHS , LayoutRight > & rhs
+             , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic)
+                                  )>::type * = 0 )
+    { shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); }
+
+  template< class ShapeRHS >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset< ShapeRHS , LayoutLeft > & rhs
+             , typename enable_if<( 1 == int(ShapeRHS::rank)
+                                    &&
+                                    1 == int(shape_type::rank)
+                                    &&
+                                    1 == int(shape_type::rank_dynamic)
+                                  )>::type * = 0 )
+    { shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 ); }
+
+  KOKKOS_INLINE_FUNCTION
+  void set_padding() {}
+
+  KOKKOS_INLINE_FUNCTION
+  size_type cardinality() const
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type capacity() const
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  size_type stride_R() const
+    {
+      return size_type(shape_type::N1) * shape_type::N2 * shape_type::N3 *
+             shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ;
+    };
+
+  // Stride with [rank] as total length
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      size_type n = 1 ;
+      if ( 7 < shape_type::rank ) { s[7] = n ; n *= shape_type::N7 ; }
+      if ( 6 < shape_type::rank ) { s[6] = n ; n *= shape_type::N6 ; }
+      if ( 5 < shape_type::rank ) { s[5] = n ; n *= shape_type::N5 ; }
+      if ( 4 < shape_type::rank ) { s[4] = n ; n *= shape_type::N4 ; }
+      if ( 3 < shape_type::rank ) { s[3] = n ; n *= shape_type::N3 ; }
+      if ( 2 < shape_type::rank ) { s[2] = n ; n *= shape_type::N2 ; }
+      if ( 1 < shape_type::rank ) { s[1] = n ; n *= shape_type::N1 ; }
+      if ( 0 < shape_type::rank ) { s[0] = n ; }
+      s[shape_type::rank] = n * shape_type::N0 ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_7() const { return 1 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_6() const { return shape_type::N7 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_5() const { return shape_type::N7 * shape_type::N6 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_4() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_3() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_2() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_1() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 * shape_type::N2 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_0() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 * shape_type::N2 * shape_type::N1 ; }
+
+  // rank 1
+  template <typename I0>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0) const
+    {
+      return i0 ;
+    }
+
+  // rank 2
+  template <typename I0, typename I1>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1 ) const
+    {
+      return i1 + shape_type::N1 * i0 ;
+    }
+
+  template <typename I0, typename I1, typename I2>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 ) const
+    {
+      return i2 + shape_type::N2 * (
+             i1 + shape_type::N1 * ( i0 ));
+    }
+
+  template <typename I0, typename I1, typename I2, typename I3>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3 ) const
+    {
+      return i3 + shape_type::N3 * (
+             i2 + shape_type::N2 * (
+             i1 + shape_type::N1 * ( i0 )));
+    }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4 ) const
+    {
+      return i4 + shape_type::N4 * (
+             i3 + shape_type::N3 * (
+             i2 + shape_type::N2 * (
+             i1 + shape_type::N1 * ( i0 ))));
+    }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5 ) const
+  {
+    return i5 + shape_type::N5 * (
+           i4 + shape_type::N4 * (
+           i3 + shape_type::N3 * (
+           i2 + shape_type::N2 * (
+           i1 + shape_type::N1 * ( i0 )))));
+  }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6 ) const
+  {
+    return i6 + shape_type::N6 * (
+           i5 + shape_type::N5 * (
+           i4 + shape_type::N4 * (
+           i3 + shape_type::N3 * (
+           i2 + shape_type::N2 * (
+           i1 + shape_type::N1 * ( i0 ))))));
+  }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7 ) const
+  {
+    return i7 + shape_type::N7 * (
+           i6 + shape_type::N6 * (
+           i5 + shape_type::N5 * (
+           i4 + shape_type::N4 * (
+           i3 + shape_type::N3 * (
+           i2 + shape_type::N2 * (
+           i1 + shape_type::N1 * ( i0 )))))));
+  }
+};
+
+//----------------------------------------------------------------------------
+// LayoutRight AND ( 1 < rank AND 1 < rank_dynamic ) : has padding / striding
+template < class ShapeType >
+struct ViewOffset< ShapeType , LayoutRight
+                 , typename enable_if<( 1 < ShapeType::rank
+                                        &&
+                                        1 < ShapeType::rank_dynamic
+                                      )>::type >
+  : public ShapeType
+{
+  typedef size_t       size_type;
+  typedef ShapeType    shape_type;
+  typedef LayoutRight  array_layout ;
+
+  enum { has_padding = true };
+
+  size_type SR ;
+
+  // This subview must be 2 == rank and 2 == rank_dynamic
+  // due to only having stride #(rank-1).
+  // The source dimension #(rank-1) must be non-zero for stride-one leading dimension.
+  // At most one prior dimension can be non-zero.
+  // Return whether the subview introduced noncontiguity.
+  template< class S , class L >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::enable_if<( 2 == shape_type::rank &&
+                             2 == shape_type::rank_dynamic &&
+                             2 <= S::rank &&
+                             Impl::is_same<L,LayoutRight>::value
+                           ), bool >::type
+  assign_subview( const ViewOffset<S,L,void> & rhs
+                , const size_t n0
+                , const size_t n1
+                , const size_t n2
+                , const size_t n3
+                , const size_t n4
+                , const size_t n5
+                , const size_t n6
+                , const size_t n7
+                )
+    {
+      const size_type nR = S::rank == 2 ? n1 : (
+                           S::rank == 3 ? n2 : (
+                           S::rank == 4 ? n3 : (
+                           S::rank == 5 ? n4 : (
+                           S::rank == 6 ? n5 : (
+                           S::rank == 7 ? n6 : n7 )))));
+
+      // N0 = first non-zero-dimension
+      // N1 = last non-zero dimension
+      // SR = stride for second non-zero dimension
+      shape_type::N0 = 0 ;
+      shape_type::N1 = nR ;
+      SR = 0 ;
+
+      if      (                n0 ) { shape_type::N0 = n0 ; SR = rhs.stride_0(); }
+      else if ( 2 < S::rank && n1 ) { shape_type::N0 = n1 ; SR = rhs.stride_1(); }
+      else if ( 3 < S::rank && n2 ) { shape_type::N0 = n2 ; SR = rhs.stride_2(); }
+      else if ( 4 < S::rank && n3 ) { shape_type::N0 = n3 ; SR = rhs.stride_3(); }
+      else if ( 5 < S::rank && n4 ) { shape_type::N0 = n4 ; SR = rhs.stride_4(); }
+      else if ( 6 < S::rank && n5 ) { shape_type::N0 = n5 ; SR = rhs.stride_5(); }
+      else if ( 7 < S::rank && n6 ) { shape_type::N0 = n6 ; SR = rhs.stride_6(); }
+
+      // Introduce noncontiguous if change the last dimension
+      // or take a range of a dimension other than the second-to-last dimension.
+
+      return 2 == S::rank ? ( size_t(shape_type::N1) != size_t(rhs.N1) || 0 == n0 ) : (
+             3 == S::rank ? ( size_t(shape_type::N1) != size_t(rhs.N2) || 0 == n1 ) : (
+             4 == S::rank ? ( size_t(shape_type::N1) != size_t(rhs.N3) || 0 == n2 ) : (
+             5 == S::rank ? ( size_t(shape_type::N1) != size_t(rhs.N4) || 0 == n3 ) : (
+             6 == S::rank ? ( size_t(shape_type::N1) != size_t(rhs.N5) || 0 == n4 ) : (
+             7 == S::rank ? ( size_t(shape_type::N1) != size_t(rhs.N6) || 0 == n5 ) : (
+                            ( size_t(shape_type::N1) != size_t(rhs.N7) || 0 == n6 ) ))))));
+    }
+
+  template< unsigned R >
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n )
+    { assign_shape_dimension<R>( *this , n ); }
+
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n0 , size_t n1 , size_t n2 , size_t n3
+             , size_t n4 , size_t n5 , size_t n6 , size_t n7
+             , size_t = 0 )
+    {
+      shape_type::assign( *this , n0, n1, n2, n3, n4, n5, n6, n7 );
+      SR = size_type(shape_type::N1) * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ;
+    }
+
+  template< class ShapeRHS >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset< ShapeRHS , LayoutRight > & rhs
+             , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) <= 1
+                                  )>::type * = 0 )
+    {
+      shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 );
+      SR = shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ;
+    }
+
+  template< class ShapeRHS >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset< ShapeRHS , LayoutRight > & rhs
+             , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) <= int(shape_type::rank_dynamic)
+                                    &&
+                                    int(ShapeRHS::rank_dynamic) > 1
+                                  )>::type * = 0 )
+    {
+      shape_type::assign( *this , rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 );
+      SR = rhs.SR ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void set_padding()
+    {
+      enum { div   = MEMORY_ALIGNMENT / shape_type::scalar_size };
+      enum { mod   = MEMORY_ALIGNMENT % shape_type::scalar_size };
+      enum { align = 0 == mod ? div : 0 };
+
+      if ( align && MEMORY_ALIGNMENT_THRESHOLD * align < SR ) {
+
+        const size_type count_mod = SR % ( div ? div : 1 );
+
+        if ( count_mod ) { SR += align - count_mod ; }
+      }
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type cardinality() const
+    { return size_type(shape_type::N0) * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type capacity() const { return shape_type::N0 * SR ; }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    {
+      size_type n = 1 ;
+      if ( 7 < shape_type::rank ) { s[7] = n ; n *= shape_type::N7 ; }
+      if ( 6 < shape_type::rank ) { s[6] = n ; n *= shape_type::N6 ; }
+      if ( 5 < shape_type::rank ) { s[5] = n ; n *= shape_type::N5 ; }
+      if ( 4 < shape_type::rank ) { s[4] = n ; n *= shape_type::N4 ; }
+      if ( 3 < shape_type::rank ) { s[3] = n ; n *= shape_type::N3 ; }
+      if ( 2 < shape_type::rank ) { s[2] = n ; n *= shape_type::N2 ; }
+      if ( 1 < shape_type::rank ) { s[1] = n ; n *= shape_type::N1 ; }
+      if ( 0 < shape_type::rank ) { s[0] = SR ; }
+      s[shape_type::rank] = SR * shape_type::N0 ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_7() const { return 1 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_6() const { return shape_type::N7 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_5() const { return shape_type::N7 * shape_type::N6 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_4() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_3() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_2() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_1() const { return shape_type::N7 * shape_type::N6 * shape_type::N5 * shape_type::N4 * shape_type::N3 * shape_type::N2 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_0() const { return SR ; }
+
+  // rank 2
+  template <typename I0, typename I1>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1 ) const
+    {
+      return i1 + i0 * SR ;
+    }
+
+  template <typename I0, typename I1, typename I2>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 ) const
+    {
+      return i2 + shape_type::N2 * ( i1 ) +
+             i0 * SR ;
+    }
+
+  template <typename I0, typename I1, typename I2, typename I3>
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3 ) const
+    {
+      return i3 + shape_type::N3 * (
+             i2 + shape_type::N2 * ( i1 )) +
+             i0 * SR ;
+    }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4 ) const
+    {
+      return i4 + shape_type::N4 * (
+             i3 + shape_type::N3 * (
+             i2 + shape_type::N2 * ( i1 ))) +
+             i0 * SR ;
+    }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5 ) const
+  {
+    return i5 + shape_type::N5 * (
+           i4 + shape_type::N4 * (
+           i3 + shape_type::N3 * (
+           i2 + shape_type::N2 * ( i1 )))) +
+           i0 * SR ;
+  }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6 ) const
+  {
+    return i6 + shape_type::N6 * (
+           i5 + shape_type::N5 * (
+           i4 + shape_type::N4 * (
+           i3 + shape_type::N3 * (
+           i2 + shape_type::N2 * ( i1 ))))) +
+           i0 * SR ;
+  }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  size_type operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7 ) const
+  {
+    return i7 + shape_type::N7 * (
+           i6 + shape_type::N6 * (
+           i5 + shape_type::N5 * (
+           i4 + shape_type::N4 * (
+           i3 + shape_type::N3 * (
+           i2 + shape_type::N2 * ( i1 )))))) +
+           i0 * SR ;
+  }
+};
+
+//----------------------------------------------------------------------------
+// LayoutStride : 
+template < class ShapeType >
+struct ViewOffset< ShapeType , LayoutStride
+                 , typename enable_if<( 0 < ShapeType::rank )>::type >
+  : public ShapeType
+{
+  typedef size_t        size_type;
+  typedef ShapeType     shape_type;
+  typedef LayoutStride  array_layout ;
+
+  size_type S[ shape_type::rank + 1 ];
+
+  template< class SType , class L >
+  KOKKOS_INLINE_FUNCTION
+  bool assign_subview( const ViewOffset<SType,L,void> & rhs
+                     , const size_type n0
+                     , const size_type n1
+                     , const size_type n2
+                     , const size_type n3
+                     , const size_type n4
+                     , const size_type n5
+                     , const size_type n6
+                     , const size_type n7
+                     )
+    {
+      shape_type::assign( *this, 0,0,0,0, 0,0,0,0 );
+
+      for ( int i = 0 ; i < int(shape_type::rank+1) ; ++i ) { S[i] = 0 ; }
+
+      // preconditions:
+      //  shape_type::rank <= rhs.rank
+      //  shape_type::rank == count of nonzero( rhs_dim[i] )
+      size_type dim[8] = { n0 , n1 , n2 , n3 , n4 , n5 , n6 , n7 };
+      size_type str[ SType::rank + 1 ];
+
+      rhs.stride( str );
+
+      // contract the zero-dimensions
+      int r = 0 ;
+      for ( int i = 0 ; i < int(SType::rank) ; ++i ) {
+        if ( 0 != dim[i] ) {
+          dim[r] = dim[i] ;
+          str[r] = str[i] ;
+          ++r ;
+        }
+      }
+
+      if ( int(shape_type::rank) == r ) {
+        // The shape is non-zero
+        for ( int i = 0 ; i < int(shape_type::rank) ; ++i ) {
+          const size_type cap = dim[i] * ( S[i] = str[i] );
+          if ( S[ shape_type::rank ] < cap ) S[ shape_type::rank ] = cap ;
+        }
+        // set the contracted nonzero dimensions
+        shape_type::assign( *this, dim[0], dim[1], dim[2], dim[3], dim[4], dim[5], dim[6], dim[7] );
+      }
+
+      return true ; // definitely noncontiguous
+    }
+
+  template< unsigned R >
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n )
+    { assign_shape_dimension<R>( *this , n ); }
+
+  template< class ShapeRHS , class Layout >
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset<ShapeRHS,Layout> & rhs
+             , typename enable_if<( int(ShapeRHS::rank) == int(shape_type::rank) )>::type * = 0 )
+    {
+      rhs.stride(S);
+      shape_type::assign( *this, rhs.N0, rhs.N1, rhs.N2, rhs.N3, rhs.N4, rhs.N5, rhs.N6, rhs.N7 );
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void assign( const LayoutStride & layout )
+  {
+    size_type max = 0 ;
+    for ( int i = 0 ; i < shape_type::rank ; ++i ) {
+      S[i] = layout.stride[i] ;
+      const size_type m = layout.dimension[i] * S[i] ;
+      if ( max < m ) { max = m ; }
+    }
+    S[ shape_type::rank ] = max ;
+    shape_type::assign( *this, layout.dimension[0], layout.dimension[1],
+                               layout.dimension[2], layout.dimension[3],
+                               layout.dimension[4], layout.dimension[5],
+                               layout.dimension[6], layout.dimension[7] );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t s0 , size_t s1 , size_t s2 , size_t s3
+             , size_t s4 , size_t s5 , size_t s6 , size_t s7
+             , size_t s8 )
+    {
+      const size_t str[9] = { s0, s1, s2, s3, s4, s5, s6, s7, s8 };
+
+      // Last argument is the total length.
+      // Total length must be non-zero.
+      // All strides must be non-zero and less than total length.
+      bool ok = 0 < str[ shape_type::rank ] ;
+
+      for ( int i = 0 ; ( i < shape_type::rank ) &&
+                        ( ok = 0 < str[i] && str[i] < str[ shape_type::rank ] ); ++i );
+
+      if ( ok ) {
+        size_t dim[8] = { 1,1,1,1,1,1,1,1 }; 
+        int iorder[9] = { 0,0,0,0,0,0,0,0,0 }; 
+
+        // Ordering of strides smallest to largest.
+        for ( int i = 1 ; i < shape_type::rank ; ++i ) {
+          int j = i ;
+          for ( ; 0 < j && str[i] < str[ iorder[j-1] ] ; --j ) {
+            iorder[j] = iorder[j-1] ;
+          }
+          iorder[j] = i ;
+        }
+
+        // Last argument is the total length.
+        iorder[ shape_type::rank ] = shape_type::rank ;
+
+        // Determine dimension associated with each stride.
+        // Guarantees non-overlap by truncating dimension
+        // if ( 0 != str[ iorder[i+1] ] % str[ iorder[i] ] )
+        for ( int i = 0 ; i < shape_type::rank ; ++i ) {
+          dim[ iorder[i] ] = str[ iorder[i+1] ] / str[ iorder[i] ] ;
+        }
+
+        // Assign dimensions and strides:
+        shape_type::assign( *this, dim[0], dim[1], dim[2], dim[3], dim[4], dim[5], dim[6], dim[7] );
+        for ( int i = 0 ; i <= shape_type::rank ; ++i ) { S[i] = str[i] ; }
+      }
+      else {
+        shape_type::assign(*this,0,0,0,0,0,0,0,0);
+        for ( int i = 0 ; i <= shape_type::rank ; ++i ) { S[i] = 0 ; }
+      }
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void set_padding() {}
+
+  KOKKOS_INLINE_FUNCTION
+  size_type cardinality() const
+    { return shape_type::N0 * shape_type::N1 * shape_type::N2 * shape_type::N3 * shape_type::N4 * shape_type::N5 * shape_type::N6 * shape_type::N7 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type capacity() const { return S[ shape_type::rank ]; }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    { for ( int i = 0 ; i <= shape_type::rank ; ++i ) { s[i] = S[i] ; } }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_0() const { return S[0] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_1() const { return S[1] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_2() const { return S[2] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_3() const { return S[3] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_4() const { return S[4] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_5() const { return S[5] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_6() const { return S[6] ; }
+
+  KOKKOS_INLINE_FUNCTION
+  size_type stride_7() const { return S[7] ; }
+
+  // rank 1
+  template <typename I0 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==1),size_type>::type
+    operator()( I0 const& i0) const
+    {
+      return i0 * S[0] ;
+    }
+
+  // rank 2
+  template <typename I0, typename I1>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==2),size_type>::type
+    operator()( I0 const& i0, I1 const& i1 ) const
+    {
+      return i0 * S[0] + i1 * S[1] ;
+    }
+
+  template <typename I0, typename I1, typename I2>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==3),size_type>::type
+    operator()( I0 const& i0, I1 const& i1, I2 const& i2 ) const
+    {
+      return i0 * S[0] + i1 * S[1] + i2 * S[2] ;
+    }
+
+  template <typename I0, typename I1, typename I2, typename I3>
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==4),size_type>::type
+    operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3 ) const
+    {
+      return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] ;
+    }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==5),size_type>::type
+    operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4 ) const
+    {
+      return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] + i4 * S[4] ;
+    }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==6),size_type>::type
+    operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5 ) const
+    {
+      return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] + i4 * S[4] + i5 * S[5] ;
+    }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==7),size_type>::type
+    operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6 ) const
+    {
+      return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] + i4 * S[4] + i5 * S[5] + i6 * S[6] ;
+    }
+
+  template < typename I0, typename I1, typename I2, typename I3
+            ,typename I4, typename I5, typename I6, typename I7 >
+  KOKKOS_FORCEINLINE_FUNCTION
+  typename std::enable_if< (std::is_integral<I0>::value) && (shape_type::rank==8),size_type>::type
+    operator()( I0 const& i0, I1 const& i1, I2 const& i2 , I3 const& i3, I4 const& i4, I5 const& i5, I6 const& i6, I7 const& i7 ) const
+    {
+      return i0 * S[0] + i1 * S[1] + i2 * S[2] + i3 * S[3] + i4 * S[4] + i5 * S[5] + i6 * S[6] + i7 * S[7] ;
+    }
+};
+
+//----------------------------------------------------------------------------
+
+template< class T >
+struct ViewOffsetRange {
+
+  enum { OK_integral_type = Impl::StaticAssert< Impl::is_integral<T>::value >::value };
+
+  enum { is_range = false };
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t dimension( size_t const , T const & ) { return 0 ; }
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t begin( T const & i ) { return size_t(i) ; }
+};
+
+template<>
+struct ViewOffsetRange<void> {
+  enum { is_range = false };
+};
+
+template<>
+struct ViewOffsetRange< Kokkos::ALL > {
+  enum { is_range = true };
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t dimension( size_t const n , ALL const & ) { return n ; }
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t begin( ALL const & ) { return 0 ; }
+};
+
+template< typename iType >
+struct ViewOffsetRange< std::pair<iType,iType> > {
+
+  enum { OK_integral_type = Impl::StaticAssert< Impl::is_integral<iType>::value >::value };
+
+  enum { is_range = true };
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t dimension( size_t const n , std::pair<iType,iType> const & r )
+    { return ( size_t(r.first) < size_t(r.second) && size_t(r.second) <= n ) ? size_t(r.second) - size_t(r.first) : 0 ; }
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t begin( std::pair<iType,iType> const & r ) { return size_t(r.first) ; }
+};
+
+template< typename iType >
+struct ViewOffsetRange< Kokkos::pair<iType,iType> > {
+
+  enum { OK_integral_type = Impl::StaticAssert< Impl::is_integral<iType>::value >::value };
+
+  enum { is_range = true };
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t dimension( size_t const n , Kokkos::pair<iType,iType> const & r )
+    { return ( size_t(r.first) < size_t(r.second) && size_t(r.second) <= n ) ? size_t(r.second) - size_t(r.first) : 0 ; }
+
+  KOKKOS_INLINE_FUNCTION static
+  size_t begin( Kokkos::pair<iType,iType> const & r ) { return size_t(r.first) ; }
+};
+
+}} // namespace Kokkos::Impl
+
+#endif //KOKKOS_VIEWOFFSET_HPP
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..006b35923d0adb9103979ee2873ea53f2254bdc3
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewSupport.hpp
@@ -0,0 +1,518 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VIEWSUPPORT_HPP
+#define KOKKOS_VIEWSUPPORT_HPP
+
+#include <Kokkos_ExecPolicy.hpp>
+#include <impl/Kokkos_Shape.hpp>
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+/** \brief  Evaluate if LHS = RHS view assignment is allowed. */
+template< class ViewLHS , class ViewRHS >
+struct ViewAssignable
+{
+  // Same memory space.
+  // Same value type.
+  // Compatible 'const' qualifier
+  // Cannot assign managed = unmannaged
+  enum { assignable_value =
+    ( is_same< typename ViewLHS::value_type ,
+               typename ViewRHS::value_type >::value
+      ||
+      is_same< typename ViewLHS::value_type ,
+               typename ViewRHS::const_value_type >::value )
+    &&
+    is_same< typename ViewLHS::memory_space ,
+             typename ViewRHS::memory_space >::value
+    &&
+    ( ! ( ViewLHS::is_managed && ! ViewRHS::is_managed ) )
+  };
+
+  enum { assignable_shape =
+    // Compatible shape and matching layout:
+    ( ShapeCompatible< typename ViewLHS::shape_type ,
+                       typename ViewRHS::shape_type >::value
+      &&
+      is_same< typename ViewLHS::array_layout ,
+               typename ViewRHS::array_layout >::value )
+    ||
+    // Matching layout, same rank, and LHS dynamic rank
+    ( is_same< typename ViewLHS::array_layout ,
+               typename ViewRHS::array_layout >::value
+      &&
+      int(ViewLHS::rank) == int(ViewRHS::rank)
+      &&
+      int(ViewLHS::rank) == int(ViewLHS::rank_dynamic) )
+    ||
+    // Both rank-0, any shape and layout
+    ( int(ViewLHS::rank) == 0 && int(ViewRHS::rank) == 0 )
+    ||
+    // Both rank-1 and LHS is dynamic rank-1, any shape and layout
+    ( int(ViewLHS::rank) == 1 && int(ViewRHS::rank) == 1 &&
+      int(ViewLHS::rank_dynamic) == 1 )
+    };
+
+  enum { value = assignable_value && assignable_shape };
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class ExecSpace , class Type , bool Initialize >
+struct ViewDefaultConstruct
+{ ViewDefaultConstruct( Type * , size_t ) {} };
+
+
+/** \brief  ViewDataHandle provides the type of the 'data handle' which the view
+ *          uses to access data with the [] operator. It also provides
+ *          an allocate function and a function to extract a raw ptr from the
+ *          data handle. ViewDataHandle also defines an enum ReferenceAble which
+ *          specifies whether references/pointers to elements can be taken and a
+ *          'return_type' which is what the view operators will give back.
+ *          Specialisation of this object allows three things depending
+ *          on ViewTraits and compiler options:
+ *          (i)   Use special allocator (e.g. huge pages/small pages and pinned memory)
+ *          (ii)  Use special data handle type (e.g. add Cuda Texture Object)
+ *          (iii) Use special access intrinsics (e.g. texture fetch and non-caching loads)
+ */
+template< class StaticViewTraits , class Enable = void >
+struct ViewDataHandle {
+
+  enum { ReturnTypeIsReference = true };
+
+  typedef typename StaticViewTraits::value_type * handle_type;
+  typedef typename StaticViewTraits::value_type & return_type;
+
+  KOKKOS_INLINE_FUNCTION
+  static handle_type create_handle( typename StaticViewTraits::value_type * arg_data_ptr, AllocationTracker const & /*arg_tracker*/ )
+  {
+    return handle_type(arg_data_ptr);
+  }
+};
+
+template< class StaticViewTraits , class Enable = void >
+class ViewDataManagement : public ViewDataHandle< StaticViewTraits > {
+private:
+
+  template< class , class > friend class ViewDataManagement ;
+
+  struct PotentiallyManaged  {};
+  struct StaticallyUnmanaged {};
+
+  /* Statically unmanaged if traits or not executing in host-accessible memory space */
+  typedef typename
+    Impl::if_c< StaticViewTraits::is_managed &&
+                Impl::is_same< Kokkos::HostSpace
+                             , Kokkos::Impl::ActiveExecutionMemorySpace >::value
+              , PotentiallyManaged
+              , StaticallyUnmanaged
+              >::type StaticManagementTag ;
+
+  enum { Unmanaged     = 0x01
+       , Noncontiguous = 0x02
+       };
+
+  enum { DefaultTraits = Impl::is_same< StaticManagementTag , StaticallyUnmanaged >::value ? Unmanaged : 0 };
+
+  unsigned m_traits ; ///< Runtime traits
+
+
+  template< class T >
+  inline static
+  unsigned assign( const ViewDataManagement<T> & rhs , const PotentiallyManaged & )
+    { return rhs.m_traits | ( rhs.is_managed() && Kokkos::HostSpace::in_parallel() ? unsigned(Unmanaged) : 0u ); }
+
+  template< class T >
+  KOKKOS_INLINE_FUNCTION static
+  unsigned assign( const ViewDataManagement<T> & rhs , const StaticallyUnmanaged & )
+    { return rhs.m_traits | Unmanaged ; }
+
+public:
+
+  typedef typename ViewDataHandle< StaticViewTraits >::handle_type handle_type;
+
+  KOKKOS_INLINE_FUNCTION
+  ViewDataManagement() : m_traits( DefaultTraits ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  ViewDataManagement( const ViewDataManagement & rhs )
+    : m_traits( assign( rhs , StaticManagementTag() ) ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  ViewDataManagement & operator = ( const ViewDataManagement & rhs )
+    { m_traits = assign( rhs , StaticManagementTag() ); return *this ; }
+
+  template< class SVT >
+  KOKKOS_INLINE_FUNCTION
+  ViewDataManagement( const ViewDataManagement<SVT> & rhs )
+    : m_traits( assign( rhs , StaticManagementTag() ) ) {}
+
+  template< class SVT >
+  KOKKOS_INLINE_FUNCTION
+  ViewDataManagement & operator = ( const ViewDataManagement<SVT> & rhs )
+    { m_traits = assign( rhs , StaticManagementTag() ); return *this ; }
+
+  KOKKOS_INLINE_FUNCTION
+  bool is_managed() const { return ! ( m_traits & Unmanaged ); }
+
+  KOKKOS_INLINE_FUNCTION
+  bool is_contiguous() const { return ! ( m_traits & Noncontiguous ); }
+
+  KOKKOS_INLINE_FUNCTION
+  void set_unmanaged() { m_traits |= Unmanaged ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void set_noncontiguous() { m_traits |= Noncontiguous ; }
+
+  template< bool Initialize >
+  static
+  handle_type allocate(  const std::string & label
+                       , const Impl::ViewOffset< typename StaticViewTraits::shape_type, typename StaticViewTraits::array_layout > & offset_map
+                       , AllocationTracker & tracker
+               )
+    {
+      typedef typename StaticViewTraits::execution_space  execution_space ;
+      typedef typename StaticViewTraits::memory_space     memory_space ;
+      typedef typename StaticViewTraits::value_type       value_type ;
+
+      const size_t count = offset_map.capacity();
+
+      tracker = memory_space::allocate_and_track( label, sizeof(value_type) * count );
+
+      value_type * ptr = reinterpret_cast<value_type *>(tracker.alloc_ptr());
+
+      // Default construct within the view's execution space.
+      (void) ViewDefaultConstruct< execution_space , value_type , Initialize >( ptr , count );
+
+      return ViewDataHandle< StaticViewTraits >::create_handle(ptr, tracker);
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class OutputView , class InputView  , unsigned Rank = OutputView::Rank >
+struct ViewRemap
+{
+  typedef typename OutputView::size_type   size_type ;
+
+  const OutputView output ;
+  const InputView  input ;
+  const size_type n0 ;
+  const size_type n1 ;
+  const size_type n2 ;
+  const size_type n3 ;
+  const size_type n4 ;
+  const size_type n5 ;
+  const size_type n6 ;
+  const size_type n7 ;
+
+  ViewRemap( const OutputView & arg_out , const InputView & arg_in )
+    : output( arg_out ), input( arg_in )
+    , n0( std::min( (size_t)arg_out.dimension_0() , (size_t)arg_in.dimension_0() ) )
+    , n1( std::min( (size_t)arg_out.dimension_1() , (size_t)arg_in.dimension_1() ) )
+    , n2( std::min( (size_t)arg_out.dimension_2() , (size_t)arg_in.dimension_2() ) )
+    , n3( std::min( (size_t)arg_out.dimension_3() , (size_t)arg_in.dimension_3() ) )
+    , n4( std::min( (size_t)arg_out.dimension_4() , (size_t)arg_in.dimension_4() ) )
+    , n5( std::min( (size_t)arg_out.dimension_5() , (size_t)arg_in.dimension_5() ) )
+    , n6( std::min( (size_t)arg_out.dimension_6() , (size_t)arg_in.dimension_6() ) )
+    , n7( std::min( (size_t)arg_out.dimension_7() , (size_t)arg_in.dimension_7() ) )
+    {
+      typedef typename OutputView::execution_space execution_space ;
+      Kokkos::RangePolicy< execution_space > range( 0 , n0 );
+      parallel_for( range , *this );
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i0 ) const
+  {
+    for ( size_type i1 = 0 ; i1 < n1 ; ++i1 ) {
+    for ( size_type i2 = 0 ; i2 < n2 ; ++i2 ) {
+    for ( size_type i3 = 0 ; i3 < n3 ; ++i3 ) {
+    for ( size_type i4 = 0 ; i4 < n4 ; ++i4 ) {
+    for ( size_type i5 = 0 ; i5 < n5 ; ++i5 ) {
+    for ( size_type i6 = 0 ; i6 < n6 ; ++i6 ) {
+    for ( size_type i7 = 0 ; i7 < n7 ; ++i7 ) {
+      output.at(i0,i1,i2,i3,i4,i5,i6,i7) = input.at(i0,i1,i2,i3,i4,i5,i6,i7);
+    }}}}}}}
+  }
+};
+
+template< class OutputView , class InputView  >
+struct ViewRemap< OutputView ,  InputView , 0 >
+{
+  typedef typename OutputView::value_type   value_type ;
+  typedef typename OutputView::memory_space dst_space ;
+  typedef typename InputView ::memory_space src_space ;
+
+  ViewRemap( const OutputView & arg_out , const InputView & arg_in )
+  {
+    DeepCopy< dst_space , src_space >( arg_out.ptr_on_device() ,
+                                       arg_in.ptr_on_device() ,
+                                       sizeof(value_type) );
+  }
+};
+
+//----------------------------------------------------------------------------
+
+template< class ExecSpace , class Type >
+struct ViewDefaultConstruct< ExecSpace , Type , true >
+{
+  Type * const m_ptr ;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( const typename ExecSpace::size_type& i ) const
+    { m_ptr[i] = Type(); }
+
+  ViewDefaultConstruct( Type * pointer , size_t capacity )
+    : m_ptr( pointer )
+    {
+      Kokkos::RangePolicy< ExecSpace > range( 0 , capacity );
+      parallel_for( range , *this );
+      ExecSpace::fence();
+    }
+};
+
+template< class OutputView , unsigned Rank = OutputView::Rank ,
+          class Enabled = void >
+struct ViewFill
+{
+  typedef typename OutputView::const_value_type  const_value_type ;
+  typedef typename OutputView::size_type         size_type ;
+
+  const OutputView output ;
+  const_value_type input ;
+
+  ViewFill( const OutputView & arg_out , const_value_type & arg_in )
+    : output( arg_out ), input( arg_in )
+    {
+      typedef typename OutputView::execution_space execution_space ;
+      Kokkos::RangePolicy< execution_space > range( 0 , output.dimension_0() );
+      parallel_for( range , *this );
+      execution_space::fence();
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type i0 ) const
+  {
+    for ( size_type i1 = 0 ; i1 < output.dimension_1() ; ++i1 ) {
+    for ( size_type i2 = 0 ; i2 < output.dimension_2() ; ++i2 ) {
+    for ( size_type i3 = 0 ; i3 < output.dimension_3() ; ++i3 ) {
+    for ( size_type i4 = 0 ; i4 < output.dimension_4() ; ++i4 ) {
+    for ( size_type i5 = 0 ; i5 < output.dimension_5() ; ++i5 ) {
+    for ( size_type i6 = 0 ; i6 < output.dimension_6() ; ++i6 ) {
+    for ( size_type i7 = 0 ; i7 < output.dimension_7() ; ++i7 ) {
+      output.at(i0,i1,i2,i3,i4,i5,i6,i7) = input ;
+    }}}}}}}
+  }
+};
+
+template< class OutputView >
+struct ViewFill< OutputView , 0 >
+{
+  typedef typename OutputView::const_value_type  const_value_type ;
+  typedef typename OutputView::memory_space      dst_space ;
+
+  ViewFill( const OutputView & arg_out , const_value_type & arg_in )
+  {
+    DeepCopy< dst_space , dst_space >( arg_out.ptr_on_device() , & arg_in ,
+                                       sizeof(const_value_type) );
+  }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+
+struct ViewAllocateWithoutInitializing {
+
+  const std::string label ;
+
+  ViewAllocateWithoutInitializing() : label() {}
+  ViewAllocateWithoutInitializing( const std::string & arg_label ) : label( arg_label ) {}
+  ViewAllocateWithoutInitializing( const char * const  arg_label ) : label( arg_label ) {}
+};
+
+struct ViewAllocate {
+
+  const std::string  label ;
+
+  ViewAllocate() : label() {}
+  ViewAllocate( const std::string & arg_label ) : label( arg_label ) {}
+  ViewAllocate( const char * const  arg_label ) : label( arg_label ) {}
+};
+
+}
+
+namespace Kokkos {
+namespace Impl {
+
+template< class Traits , class AllocationProperties , class Enable = void >
+struct ViewAllocProp : public Kokkos::Impl::false_type {};
+
+template< class Traits >
+struct ViewAllocProp< Traits , Kokkos::ViewAllocate
+  , typename Kokkos::Impl::enable_if<(
+      Traits::is_managed && ! Kokkos::Impl::is_const< typename Traits::value_type >::value
+    )>::type >
+  : public Kokkos::Impl::true_type
+{
+  typedef size_t               size_type ;
+  typedef const ViewAllocate & property_type ;
+
+  enum { Initialize = true };
+  enum { AllowPadding = false };
+
+  inline
+  static const std::string & label( property_type p ) { return p.label ; }
+};
+
+template< class Traits >
+struct ViewAllocProp< Traits , std::string
+  , typename Kokkos::Impl::enable_if<(
+      Traits::is_managed && ! Kokkos::Impl::is_const< typename Traits::value_type >::value
+    )>::type >
+  : public Kokkos::Impl::true_type
+{
+  typedef size_t              size_type ;
+  typedef const std::string & property_type ;
+
+  enum { Initialize = true };
+  enum { AllowPadding = false };
+
+  inline
+  static const std::string & label( property_type s ) { return s ; }
+};
+
+template< class Traits , unsigned N >
+struct ViewAllocProp< Traits , char[N]
+  , typename Kokkos::Impl::enable_if<(
+      Traits::is_managed && ! Kokkos::Impl::is_const< typename Traits::value_type >::value
+    )>::type >
+  : public Kokkos::Impl::true_type
+{
+private:
+  typedef char label_type[N] ;
+public:
+
+  typedef size_t             size_type ;
+  typedef const label_type & property_type ;
+
+  enum { Initialize = true };
+  enum { AllowPadding = false };
+
+  inline
+  static std::string label( property_type s ) { return std::string(s) ; }
+};
+
+template< class Traits >
+struct ViewAllocProp< Traits , Kokkos::ViewAllocateWithoutInitializing
+  , typename Kokkos::Impl::enable_if<(
+      Traits::is_managed && ! Kokkos::Impl::is_const< typename Traits::value_type >::value
+    )>::type >
+  : public Kokkos::Impl::true_type
+{
+  typedef size_t size_type ;
+  typedef const Kokkos::ViewAllocateWithoutInitializing & property_type ;
+
+  enum { Initialize = false };
+  enum { AllowPadding = false };
+
+  inline
+  static std::string label( property_type s ) { return s.label ; }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class Traits , class PointerProperties , class Enable = void >
+struct ViewRawPointerProp : public Kokkos::Impl::false_type {};
+
+template< class Traits , typename T >
+struct ViewRawPointerProp< Traits , T ,
+  typename Kokkos::Impl::enable_if<(
+    Impl::is_same< T , typename Traits::value_type >::value ||
+    Impl::is_same< T , typename Traits::non_const_value_type >::value
+  )>::type >
+  : public Kokkos::Impl::true_type
+{
+  typedef size_t size_type ;
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_VIEWSUPPORT_HPP */
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..91d30927a63c8a92f6876a40137ede764e0babab
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_ViewTileLeft.hpp
@@ -0,0 +1,195 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef KOKKOS_VIEWTILELEFT_HPP
+#define KOKKOS_VIEWTILELEFT_HPP
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+namespace Kokkos {
+namespace Impl {
+
+template< class T , unsigned N0 , unsigned N1 , class MemorySpace , class MemoryTraits >
+struct ViewSpecialize< T , void , LayoutTileLeft<N0,N1> , MemorySpace , MemoryTraits >
+{
+  typedef ViewDefault type ;
+};
+
+struct ViewTile {};
+
+template< class ShapeType , unsigned N0 , unsigned N1 >
+struct ViewOffset< ShapeType
+                 , LayoutTileLeft<N0,N1,true> /* Only accept properly shaped tiles */
+                 , typename Impl::enable_if<( 2 == ShapeType::rank
+                                              &&
+                                              2 == ShapeType::rank_dynamic
+                                            )>::type >
+  : public ShapeType
+{
+  enum { SHIFT_0 = Impl::power_of_two<N0>::value };
+  enum { SHIFT_1 = Impl::power_of_two<N1>::value };
+  enum { MASK_0  = N0 - 1 };
+  enum { MASK_1  = N1 - 1 };
+
+  typedef size_t                      size_type ;
+  typedef ShapeType                   shape_type ;
+  typedef LayoutTileLeft<N0,N1,true>  array_layout ;
+
+  enum { has_padding = true };
+
+  size_type tile_N0 ;
+
+  KOKKOS_INLINE_FUNCTION
+  void assign( const ViewOffset & rhs )
+    {
+      shape_type::N0 = rhs.N0 ;
+      shape_type::N1 = rhs.N1 ;
+      tile_N0 = ( rhs.N0 + MASK_0 ) >> SHIFT_0 ; // number of tiles in first dimension
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void assign( size_t n0 , size_t n1
+             , int = 0 , int = 0
+             , int = 0 , int = 0
+             , int = 0 , int = 0
+             , int = 0
+             )
+    {
+      shape_type::N0 = n0 ;
+      shape_type::N1 = n1 ;
+      tile_N0 = ( n0 + MASK_0 ) >> SHIFT_0 ; // number of tiles in first dimension
+    }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void set_padding() {}
+
+
+  template< typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION
+  size_type operator()( I0 const & i0 , I1 const & i1
+                      , int = 0 , int = 0
+                      , int = 0 , int = 0
+                      , int = 0 , int = 0
+                      ) const
+    {
+      return /* ( ( Tile offset                             ) *  ( Tile size       ) ) */
+                ( ( (i0>>SHIFT_0) + tile_N0 * (i1>>SHIFT_1) ) << (SHIFT_0 + SHIFT_1) ) +
+             /* ( Offset within tile                       ) */
+                ( (i0 & MASK_0) + ((i1 & MASK_1)<<SHIFT_0) ) ;
+    }
+
+  template< typename I0 , typename I1 >
+  KOKKOS_INLINE_FUNCTION
+  size_type tile_begin( I0 const & i_tile0 , I1 const & i_tile1 ) const
+    {
+      return ( i_tile0 + tile_N0 * i_tile1 ) << ( SHIFT_0 + SHIFT_1 );
+    }
+
+
+  KOKKOS_INLINE_FUNCTION
+  size_type capacity() const
+    {
+      // ( TileDim0 * ( TileDim1 ) ) * TileSize
+      return ( tile_N0 * ( ( shape_type::N1 + MASK_1 ) >> SHIFT_1 ) ) << ( SHIFT_0 + SHIFT_1 );
+    }
+};
+
+template<>
+struct ViewAssignment< ViewTile , void , void >
+{
+  // Some compilers have type-matching issues on the integer values when using:
+  //   template< class T , unsigned N0 , unsigned N1 , class A2 , class A3 >
+  template< class T , unsigned dN0 , unsigned dN1
+          , class A2 , class A3
+          , unsigned sN0 , unsigned sN1 >
+  KOKKOS_INLINE_FUNCTION
+  ViewAssignment( View< T[dN0][dN1], LayoutLeft, A2, A3, Impl::ViewDefault > & dst
+                , View< T** , LayoutTileLeft<sN0,sN1,true>, A2, A3, Impl::ViewDefault > const & src
+                , size_t const i_tile0
+                , typename Impl::enable_if< unsigned(dN0) == unsigned(sN0) &&
+                                            unsigned(dN1) == unsigned(sN1)
+                                          , size_t const
+                                          >::type i_tile1
+                )
+   {
+     // Destination is always contiguous but source may be non-contiguous
+     // so don't assign the whole view management object.
+     // Just query and appropriately set the reference-count state.
+
+     if ( ! src.m_management.is_managed() ) dst.m_management.set_unmanaged();
+
+     dst.m_ptr_on_device = src.m_ptr_on_device + src.m_offset_map.tile_begin(i_tile0,i_tile1);
+
+     dst.m_tracker = src.m_tracker;
+   }
+};
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+namespace Kokkos {
+
+template< class T , unsigned N0, unsigned N1, class A2, class A3 >
+KOKKOS_INLINE_FUNCTION
+View< T[N0][N1], LayoutLeft, A2, A3, Impl::ViewDefault >
+tile_subview( const View<T**,LayoutTileLeft<N0,N1,true>,A2,A3,Impl::ViewDefault> & src
+            , const size_t i_tile0
+            , const size_t i_tile1
+            )
+{
+  View< T[N0][N1], LayoutLeft, A2, A3, Impl::ViewDefault > dst ;
+
+  (void) Impl::ViewAssignment< Impl::ViewTile , void , void >( dst , src , i_tile0 , i_tile1 );
+
+  return dst ;
+}
+
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif /* #ifndef KOKKOS_VIEWTILELEFT_HPP */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp b/lib/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..420ee63891e6ddb0995ad7bbbcfba2f0548c2bd9
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_Volatile_Load.hpp
@@ -0,0 +1,242 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_VOLATILE_LOAD )
+#define KOKKOS_VOLATILE_LOAD
+
+#if defined( __GNUC__ ) /* GNU C   */ || \
+    defined( __GNUG__ ) /* GNU C++ */ || \
+    defined( __clang__ )
+
+#define KOKKOS_MAY_ALIAS __attribute__((__may_alias__))
+
+#else
+
+#define KOKKOS_MAY_ALIAS
+
+#endif
+
+namespace Kokkos {
+
+//----------------------------------------------------------------------------
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+T volatile_load(T const volatile * const src_ptr)
+{
+  typedef uint64_t KOKKOS_MAY_ALIAS T64;
+  typedef uint32_t KOKKOS_MAY_ALIAS T32;
+  typedef uint16_t KOKKOS_MAY_ALIAS T16;
+  typedef uint8_t  KOKKOS_MAY_ALIAS T8;
+
+  enum {
+    NUM_8  = sizeof(T),
+    NUM_16 = NUM_8 / 2,
+    NUM_32 = NUM_8 / 4,
+    NUM_64 = NUM_8 / 8
+  };
+
+  union {
+    T   const volatile * const ptr;
+    T64 const volatile * const ptr64;
+    T32 const volatile * const ptr32;
+    T16 const volatile * const ptr16;
+    T8  const volatile * const ptr8;
+  } src = {src_ptr};
+
+  T result;
+
+  union {
+    T   * const ptr;
+    T64 * const ptr64;
+    T32 * const ptr32;
+    T16 * const ptr16;
+    T8  * const ptr8;
+  } dst = {&result};
+
+  for (int i=0; i < NUM_64; ++i) {
+    dst.ptr64[i] = src.ptr64[i];
+  }
+
+  if ( NUM_64*2 < NUM_32 ) {
+    dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2];
+  }
+
+  if ( NUM_32*2 < NUM_16 ) {
+    dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2];
+  }
+
+  if ( NUM_16*2 < NUM_8 ) {
+    dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2];
+  }
+
+  return result;
+}
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+void volatile_store(T volatile * const dst_ptr, T const volatile * const src_ptr)
+{
+  typedef uint64_t KOKKOS_MAY_ALIAS T64;
+  typedef uint32_t KOKKOS_MAY_ALIAS T32;
+  typedef uint16_t KOKKOS_MAY_ALIAS T16;
+  typedef uint8_t  KOKKOS_MAY_ALIAS T8;
+
+  enum {
+    NUM_8  = sizeof(T),
+    NUM_16 = NUM_8 / 2,
+    NUM_32 = NUM_8 / 4,
+    NUM_64 = NUM_8 / 8
+  };
+
+  union {
+    T   const volatile * const ptr;
+    T64 const volatile * const ptr64;
+    T32 const volatile * const ptr32;
+    T16 const volatile * const ptr16;
+    T8  const volatile * const ptr8;
+  } src = {src_ptr};
+
+  union {
+    T   volatile * const ptr;
+    T64 volatile * const ptr64;
+    T32 volatile * const ptr32;
+    T16 volatile * const ptr16;
+    T8  volatile * const ptr8;
+  } dst = {dst_ptr};
+
+  for (int i=0; i < NUM_64; ++i) {
+    dst.ptr64[i] = src.ptr64[i];
+  }
+
+  if ( NUM_64*2 < NUM_32 ) {
+    dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2];
+  }
+
+  if ( NUM_32*2 < NUM_16 ) {
+    dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2];
+  }
+
+  if ( NUM_16*2 < NUM_8 ) {
+    dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2];
+  }
+}
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+void volatile_store(T volatile * const dst_ptr, T const * const src_ptr)
+{
+  typedef uint64_t KOKKOS_MAY_ALIAS T64;
+  typedef uint32_t KOKKOS_MAY_ALIAS T32;
+  typedef uint16_t KOKKOS_MAY_ALIAS T16;
+  typedef uint8_t  KOKKOS_MAY_ALIAS T8;
+
+  enum {
+    NUM_8  = sizeof(T),
+    NUM_16 = NUM_8 / 2,
+    NUM_32 = NUM_8 / 4,
+    NUM_64 = NUM_8 / 8
+  };
+
+  union {
+    T   const * const ptr;
+    T64 const * const ptr64;
+    T32 const * const ptr32;
+    T16 const * const ptr16;
+    T8  const * const ptr8;
+  } src = {src_ptr};
+
+  union {
+    T   volatile * const ptr;
+    T64 volatile * const ptr64;
+    T32 volatile * const ptr32;
+    T16 volatile * const ptr16;
+    T8  volatile * const ptr8;
+  } dst = {dst_ptr};
+
+  for (int i=0; i < NUM_64; ++i) {
+    dst.ptr64[i] = src.ptr64[i];
+  }
+
+  if ( NUM_64*2 < NUM_32 ) {
+    dst.ptr32[NUM_64*2] = src.ptr32[NUM_64*2];
+  }
+
+  if ( NUM_32*2 < NUM_16 ) {
+    dst.ptr16[NUM_32*2] = src.ptr16[NUM_32*2];
+  }
+
+  if ( NUM_16*2 < NUM_8 ) {
+    dst.ptr8[NUM_16*2] = src.ptr8[NUM_16*2];
+  }
+}
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+void volatile_store(T volatile * dst_ptr, T const volatile & src)
+{ volatile_store(dst_ptr, &src); }
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+void volatile_store(T volatile * dst_ptr, T const & src)
+{ volatile_store(dst_ptr, &src); }
+
+template <typename T>
+KOKKOS_FORCEINLINE_FUNCTION
+T safe_load(T const * const ptr)
+{
+#if !defined( __MIC__ )
+  return *ptr;
+#else
+  return volatile_load(ptr);
+#endif
+}
+
+} // namespace kokkos
+
+#undef KOKKOS_MAY_ALIAS
+
+#endif
+
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp b/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..1d173fb4fb42b267953f57ef263bccb7f89f8297
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp
@@ -0,0 +1,704 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#define DEBUG_PRINT 0
+
+#include <iostream>
+#include <sstream>
+
+#include <Kokkos_Macros.hpp>
+#include <Kokkos_hwloc.hpp>
+#include <impl/Kokkos_Error.hpp>
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace hwloc {
+
+/* Return 0 if asynchronous, 1 if synchronous and include process. */
+unsigned thread_mapping( const char * const label ,
+                         const bool allow_async ,
+                         unsigned & thread_count ,
+                         unsigned & use_numa_count ,
+                         unsigned & use_cores_per_numa ,
+                         std::pair<unsigned,unsigned> threads_coord[] )
+{
+  const bool     hwloc_avail            = Kokkos::hwloc::available();
+  const unsigned avail_numa_count       = hwloc_avail ? hwloc::get_available_numa_count() : 1 ;
+  const unsigned avail_cores_per_numa   = hwloc_avail ? hwloc::get_available_cores_per_numa() : thread_count ;
+  const unsigned avail_threads_per_core = hwloc_avail ? hwloc::get_available_threads_per_core() : 1 ;
+
+  // (numa,core) coordinate of the process:
+  const std::pair<unsigned,unsigned> proc_coord = Kokkos::hwloc::get_this_thread_coordinate();
+
+  //------------------------------------------------------------------------
+  // Defaults for unspecified inputs:
+
+  if ( ! use_numa_count ) {
+    // Default to use all NUMA regions
+    use_numa_count = ! thread_count ? avail_numa_count : (
+                       thread_count < avail_numa_count ? thread_count : avail_numa_count );
+  }
+
+  if ( ! use_cores_per_numa ) {
+    // Default to use all but one core if asynchronous, all cores if synchronous.
+    const unsigned threads_per_numa = thread_count / use_numa_count ;
+
+    use_cores_per_numa = ! threads_per_numa ? avail_cores_per_numa - ( allow_async ? 1 : 0 ) : (
+                           threads_per_numa < avail_cores_per_numa ? threads_per_numa : avail_cores_per_numa );
+  }
+
+  if ( ! thread_count ) {
+    thread_count = use_numa_count * use_cores_per_numa * avail_threads_per_core ;
+  }
+
+  //------------------------------------------------------------------------
+  // Input verification:
+
+  const bool valid_numa      = use_numa_count <= avail_numa_count ;
+  const bool valid_cores     = use_cores_per_numa &&
+                               use_cores_per_numa <= avail_cores_per_numa ;
+  const bool valid_threads   = thread_count &&
+                               thread_count <= use_numa_count * use_cores_per_numa * avail_threads_per_core ;
+  const bool balanced_numa   = ! ( thread_count % use_numa_count );
+  const bool balanced_cores  = ! ( thread_count % ( use_numa_count * use_cores_per_numa ) );
+
+  const bool valid_input = valid_numa && valid_cores && valid_threads && balanced_numa && balanced_cores ;
+
+  if ( ! valid_input ) {
+
+    std::ostringstream msg ;
+
+    msg << label << " HWLOC ERROR(s)" ;
+
+    if ( ! valid_threads ) {
+      msg << " : thread_count(" << thread_count
+          << ") exceeds capacity("
+          << use_numa_count * use_cores_per_numa * avail_threads_per_core
+          << ")" ;
+    }
+    if ( ! valid_numa ) {
+      msg << " : use_numa_count(" << use_numa_count
+          << ") exceeds capacity(" << avail_numa_count << ")" ;
+    }
+    if ( ! valid_cores ) {
+      msg << " : use_cores_per_numa(" << use_cores_per_numa
+          << ") exceeds capacity(" << avail_cores_per_numa << ")" ;
+    }
+    if ( ! balanced_numa ) {
+      msg << " : thread_count(" << thread_count
+          << ") imbalanced among numa(" << use_numa_count << ")" ;
+    }
+    if ( ! balanced_cores ) {
+      msg << " : thread_count(" << thread_count
+          << ") imbalanced among cores(" << use_numa_count * use_cores_per_numa << ")" ;
+    }
+
+    Kokkos::Impl::throw_runtime_exception( msg.str() );
+  }
+
+  const unsigned thread_spawn_synchronous =
+    ( allow_async &&
+      1 < thread_count &&
+      ( use_numa_count     < avail_numa_count ||
+        use_cores_per_numa < avail_cores_per_numa ) )
+     ? 0 /* asyncronous */
+     : 1 /* synchronous, threads_coord[0] is process core */ ;
+
+  // Determine binding coordinates for to-be-spawned threads so that
+  // threads may be bound to cores as they are spawned.
+
+  const unsigned threads_per_core = thread_count / ( use_numa_count * use_cores_per_numa );
+
+  if ( thread_spawn_synchronous ) {
+    // Working synchronously and include process core as threads_coord[0].
+    // Swap the NUMA coordinate of the process core with 0
+    // Swap the CORE coordinate of the process core with 0
+    for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) {
+      const unsigned numa_coord = 0 == inuma ? proc_coord.first : ( proc_coord.first == inuma ? 0 : inuma );
+      for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) {
+        const unsigned core_coord = 0 == icore ? proc_coord.second : ( proc_coord.second == icore ? 0 : icore );
+        for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) {
+          threads_coord[i].first  = numa_coord ;
+          threads_coord[i].second = core_coord ;
+        }
+      }
+    }
+  }
+  else if ( use_numa_count < avail_numa_count ) {
+    // Working asynchronously and omit the process' NUMA region from the pool.
+    // Swap the NUMA coordinate of the process core with ( ( avail_numa_count - use_numa_count ) - 1 )
+    const unsigned numa_coord_swap = ( avail_numa_count - use_numa_count ) - 1 ;
+    for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) {
+      const unsigned numa_coord = proc_coord.first == inuma ? numa_coord_swap : inuma ;
+      for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) {
+        const unsigned core_coord = icore ;
+        for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) {
+          threads_coord[i].first  = numa_coord ;
+          threads_coord[i].second = core_coord ;
+        }
+      }
+    }
+  }
+  else if ( use_cores_per_numa < avail_cores_per_numa ) {
+    // Working asynchronously and omit the process' core from the pool.
+    // Swap the CORE coordinate of the process core with ( ( avail_cores_per_numa - use_cores_per_numa ) - 1 )
+    const unsigned core_coord_swap = ( avail_cores_per_numa - use_cores_per_numa ) - 1 ;
+    for ( unsigned i = 0 , inuma = avail_numa_count - use_numa_count ; inuma < avail_numa_count ; ++inuma ) {
+      const unsigned numa_coord = inuma ;
+      for ( unsigned icore = avail_cores_per_numa - use_cores_per_numa ; icore < avail_cores_per_numa ; ++icore ) {
+        const unsigned core_coord = proc_coord.second == icore ? core_coord_swap : icore ;
+        for ( unsigned ith = 0 ; ith < threads_per_core ; ++ith , ++i ) {
+          threads_coord[i].first  = numa_coord ;
+          threads_coord[i].second = core_coord ;
+        }
+      }
+    }
+  }
+
+  return thread_spawn_synchronous ;
+}
+
+} /* namespace hwloc */
+} /* namespace Kokkos */
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#if defined( KOKKOS_HAVE_HWLOC )
+
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+
+/*--------------------------------------------------------------------------*/
+/* Third Party Libraries */
+
+/* Hardware locality library: http://www.open-mpi.org/projects/hwloc/ */
+#include <hwloc.h>
+
+#define  REQUIRED_HWLOC_API_VERSION  0x000010300
+
+#if HWLOC_API_VERSION < REQUIRED_HWLOC_API_VERSION
+#error "Requires  http://www.open-mpi.org/projects/hwloc/  Version 1.3 or greater"
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace hwloc {
+namespace {
+
+#if DEBUG_PRINT
+
+inline
+void print_bitmap( std::ostream & s , const hwloc_const_bitmap_t bitmap )
+{
+  s << "{" ;
+  for ( int i = hwloc_bitmap_first( bitmap ) ;
+        -1 != i ; i = hwloc_bitmap_next( bitmap , i ) ) {
+    s << " " << i ;
+  }
+  s << " }" ;
+}
+
+#endif
+
+enum { MAX_CORE = 1024 };
+
+std::pair<unsigned,unsigned> s_core_topology(0,0);
+unsigned                     s_core_capacity(0);
+hwloc_topology_t             s_hwloc_topology(0);
+hwloc_bitmap_t               s_hwloc_location(0);
+hwloc_bitmap_t               s_process_binding(0);
+hwloc_bitmap_t               s_core[ MAX_CORE ];
+
+struct Sentinel {
+  ~Sentinel();
+  Sentinel();
+};
+
+bool sentinel()
+{
+  static Sentinel self ;
+
+  if ( 0 == s_hwloc_topology ) {
+    std::cerr << "Kokkos::hwloc ERROR : Called after return from main()" << std::endl ;
+    std::cerr.flush();
+  }
+
+  return 0 != s_hwloc_topology ;
+}
+
+Sentinel::~Sentinel()
+{
+  hwloc_topology_destroy( s_hwloc_topology );
+  hwloc_bitmap_free( s_process_binding );
+  hwloc_bitmap_free( s_hwloc_location );
+
+  s_core_topology.first  = 0 ;
+  s_core_topology.second = 0 ;
+  s_core_capacity   = 0 ;
+  s_hwloc_topology  = 0 ;
+  s_hwloc_location  = 0 ;
+  s_process_binding = 0 ;
+}
+
+Sentinel::Sentinel()
+{
+#if defined(__MIC__)
+  static const bool remove_core_0 = true ;
+#else
+  static const bool remove_core_0 = false ;
+#endif
+
+  s_core_topology   = std::pair<unsigned,unsigned>(0,0);
+  s_core_capacity   = 0 ;
+  s_hwloc_topology  = 0 ;
+  s_hwloc_location  = 0 ;
+  s_process_binding = 0 ;
+
+  for ( unsigned i = 0 ; i < MAX_CORE ; ++i ) s_core[i] = 0 ;
+
+  hwloc_topology_init( & s_hwloc_topology );
+  hwloc_topology_load( s_hwloc_topology );
+
+  s_hwloc_location  = hwloc_bitmap_alloc();
+  s_process_binding = hwloc_bitmap_alloc();
+
+  hwloc_get_cpubind( s_hwloc_topology , s_process_binding ,  HWLOC_CPUBIND_PROCESS );
+
+  if ( remove_core_0 ) {
+
+    const hwloc_obj_t core = hwloc_get_obj_by_type( s_hwloc_topology , HWLOC_OBJ_CORE , 0 );
+
+    if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {
+
+      hwloc_bitmap_t s_process_no_core_zero = hwloc_bitmap_alloc();
+
+      hwloc_bitmap_andnot( s_process_no_core_zero , s_process_binding , core->allowed_cpuset );
+
+      bool ok = 0 == hwloc_set_cpubind( s_hwloc_topology ,
+                                        s_process_no_core_zero ,
+                                        HWLOC_CPUBIND_PROCESS | HWLOC_CPUBIND_STRICT );
+
+      if ( ok ) {
+        hwloc_get_cpubind( s_hwloc_topology , s_process_binding ,  HWLOC_CPUBIND_PROCESS );
+
+        ok = 0 != hwloc_bitmap_isequal( s_process_binding , s_process_no_core_zero );
+      }
+
+      hwloc_bitmap_free( s_process_no_core_zero );
+
+      if ( ! ok ) {
+        std::cerr << "WARNING: Kokkos::hwloc attempted and failed to move process off of core #0" << std::endl ;
+      }
+    }
+  }
+
+  // Choose a hwloc object type for the NUMA level, which may not exist.
+
+  hwloc_obj_type_t root_type = HWLOC_OBJ_TYPE_MAX ;
+
+  {
+    // Object types to search, in order.
+    static const hwloc_obj_type_t candidate_root_type[] =
+      { HWLOC_OBJ_NODE     /* NUMA region     */
+      , HWLOC_OBJ_SOCKET   /* hardware socket */
+      , HWLOC_OBJ_MACHINE  /* local machine   */
+      };
+
+    enum { CANDIDATE_ROOT_TYPE_COUNT =
+             sizeof(candidate_root_type) / sizeof(hwloc_obj_type_t) };
+
+    for ( int k = 0 ; k < CANDIDATE_ROOT_TYPE_COUNT && HWLOC_OBJ_TYPE_MAX == root_type ; ++k ) {
+      if ( 0 < hwloc_get_nbobjs_by_type( s_hwloc_topology , candidate_root_type[k] ) ) {
+        root_type = candidate_root_type[k] ;
+      }
+    }
+  }
+
+  // Determine which of these 'root' types are available to this process.
+  // The process may have been bound (e.g., by MPI) to a subset of these root types.
+  // Determine current location of the master (calling) process>
+
+  hwloc_bitmap_t proc_cpuset_location = hwloc_bitmap_alloc();
+
+  hwloc_get_last_cpu_location( s_hwloc_topology , proc_cpuset_location , HWLOC_CPUBIND_THREAD );
+
+  const unsigned max_root = hwloc_get_nbobjs_by_type( s_hwloc_topology , root_type );
+
+  unsigned root_base     = max_root ;
+  unsigned root_count    = 0 ;
+  unsigned core_per_root = 0 ;
+  unsigned pu_per_core   = 0 ;
+  bool     symmetric     = true ;
+
+  for ( unsigned i = 0 ; i < max_root ; ++i ) {
+
+    const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , i );
+
+    if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) {
+
+      ++root_count ;
+
+      // Remember which root (NUMA) object the master thread is running on.
+      // This will be logical NUMA rank #0 for this process.
+
+      if ( hwloc_bitmap_intersects( proc_cpuset_location, root->allowed_cpuset ) ) {
+        root_base = i ;
+      }
+
+      // Count available cores:
+
+      const unsigned max_core =
+        hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
+                                                root->allowed_cpuset ,
+                                                HWLOC_OBJ_CORE );
+
+      unsigned core_count = 0 ;
+
+      for ( unsigned j = 0 ; j < max_core ; ++j ) {
+
+        const hwloc_obj_t core =
+          hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology ,
+                                               root->allowed_cpuset ,
+                                               HWLOC_OBJ_CORE , j );
+
+        // If process' cpuset intersects core's cpuset then process can access this core.
+        // Must use intersection instead of inclusion because the Intel-Phi
+        // MPI may bind the process to only one of the core's hyperthreads.
+        //
+        // Assumption: if the process can access any hyperthread of the core
+        // then it has ownership of the entire core.
+        // This assumes that it would be performance-detrimental
+        // to spawn more than one MPI process per core and use nested threading.
+
+        if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {
+
+          ++core_count ;
+
+          const unsigned pu_count =
+            hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
+                                                    core->allowed_cpuset ,
+                                                    HWLOC_OBJ_PU );
+
+          if ( pu_per_core == 0 ) pu_per_core = pu_count ;
+
+          // Enforce symmetry by taking the minimum:
+
+          pu_per_core = std::min( pu_per_core , pu_count );
+
+          if ( pu_count != pu_per_core ) symmetric = false ;
+        }
+      }
+
+      if ( 0 == core_per_root ) core_per_root = core_count ;
+
+      // Enforce symmetry by taking the minimum:
+
+      core_per_root = std::min( core_per_root , core_count );
+
+      if ( core_count != core_per_root ) symmetric = false ;
+    }
+  }
+
+  s_core_topology.first  = root_count ;
+  s_core_topology.second = core_per_root ;
+  s_core_capacity        = pu_per_core ;
+
+  // Fill the 's_core' array for fast mapping from a core coordinate to the
+  // hwloc cpuset object required for thread location querying and binding.
+
+  for ( unsigned i = 0 ; i < max_root ; ++i ) {
+
+    const unsigned root_rank = ( i + root_base ) % max_root ;
+
+    const hwloc_obj_t root = hwloc_get_obj_by_type( s_hwloc_topology , root_type , root_rank );
+
+    if ( hwloc_bitmap_intersects( s_process_binding , root->allowed_cpuset ) ) {
+
+      const unsigned max_core =
+        hwloc_get_nbobjs_inside_cpuset_by_type( s_hwloc_topology ,
+                                                root->allowed_cpuset ,
+                                                HWLOC_OBJ_CORE );
+
+      unsigned core_count = 0 ;
+
+      for ( unsigned j = 0 ; j < max_core && core_count < core_per_root ; ++j ) {
+
+        const hwloc_obj_t core =
+          hwloc_get_obj_inside_cpuset_by_type( s_hwloc_topology ,
+                                               root->allowed_cpuset ,
+                                               HWLOC_OBJ_CORE , j );
+
+        if ( hwloc_bitmap_intersects( s_process_binding , core->allowed_cpuset ) ) {
+
+          s_core[ core_count + core_per_root * i ] = core->allowed_cpuset ;
+
+          ++core_count ;
+        }
+      }
+    }
+  }
+
+  hwloc_bitmap_free( proc_cpuset_location );
+
+  if ( ! symmetric ) {
+    std::cout << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology."
+              << std::endl ;
+  }
+}
+
+
+} // namespace
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+bool available()
+{ return true ; }
+
+unsigned get_available_numa_count()
+{ sentinel(); return s_core_topology.first ; }
+
+unsigned get_available_cores_per_numa()
+{ sentinel(); return s_core_topology.second ; }
+
+unsigned get_available_threads_per_core()
+{ sentinel(); return s_core_capacity ; }
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+unsigned bind_this_thread(
+  const unsigned               coordinate_count ,
+  std::pair<unsigned,unsigned> coordinate[] )
+{
+  unsigned i = 0 ;
+
+  try {
+    const std::pair<unsigned,unsigned> current = get_this_thread_coordinate();
+
+    // Match one of the requests:
+    for ( i = 0 ; i < coordinate_count && current != coordinate[i] ; ++i );
+
+    if ( coordinate_count == i ) {
+      // Match the first request (typically NUMA):
+      for ( i = 0 ; i < coordinate_count && current.first != coordinate[i].first ; ++i );
+    }
+
+    if ( coordinate_count == i ) {
+      // Match any unclaimed request:
+      for ( i = 0 ; i < coordinate_count && ~0u == coordinate[i].first  ; ++i );
+    }
+
+    if ( coordinate_count == i || ! bind_this_thread( coordinate[i] ) ) {
+       // Failed to bind:
+       i = ~0u ;
+    }
+
+    if ( i < coordinate_count ) {
+
+#if DEBUG_PRINT
+      if ( current != coordinate[i] ) {
+        std::cout << "  bind_this_thread: rebinding from ("
+                  << current.first << ","
+                  << current.second
+                  << ") to ("
+                  << coordinate[i].first << ","
+                  << coordinate[i].second
+                  << ")" << std::endl ;
+      }
+#endif
+
+      coordinate[i].first  = ~0u ;
+      coordinate[i].second = ~0u ;
+    }
+  }
+  catch( ... ) {
+    i = ~0u ;
+  }
+
+  return i ;
+}
+
+
+bool bind_this_thread( const std::pair<unsigned,unsigned> coord )
+{
+  if ( ! sentinel() ) return false ;
+
+#if DEBUG_PRINT
+
+  std::cout << "Kokkos::bind_this_thread() at " ;
+
+  hwloc_get_last_cpu_location( s_hwloc_topology ,
+                               s_hwloc_location , HWLOC_CPUBIND_THREAD );
+
+  print_bitmap( std::cout , s_hwloc_location );
+
+  std::cout << " to " ;
+
+  print_bitmap( std::cout , s_core[ coord.second + coord.first * s_core_topology.second ] );
+
+  std::cout << std::endl ;
+
+#endif
+
+  // As safe and fast as possible.
+  // Fast-lookup by caching the coordinate -> hwloc cpuset mapping in 's_core'.
+  return coord.first  < s_core_topology.first &&
+         coord.second < s_core_topology.second &&
+         0 == hwloc_set_cpubind( s_hwloc_topology ,
+                                 s_core[ coord.second + coord.first * s_core_topology.second ] ,
+                                 HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT );
+}
+
+bool unbind_this_thread()
+{
+  if ( ! sentinel() ) return false ;
+
+#define HWLOC_DEBUG_PRINT 0
+
+#if HWLOC_DEBUG_PRINT
+
+  std::cout << "Kokkos::unbind_this_thread() from " ;
+
+  hwloc_get_cpubind( s_hwloc_topology , s_hwloc_location , HWLOC_CPUBIND_THREAD );
+
+  print_bitmap( std::cout , s_hwloc_location );
+
+#endif
+
+  const bool result =
+    s_hwloc_topology &&
+    0 == hwloc_set_cpubind( s_hwloc_topology ,
+                            s_process_binding ,
+                            HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT );
+
+#if HWLOC_DEBUG_PRINT
+
+  std::cout << " to " ;
+
+  hwloc_get_cpubind( s_hwloc_topology , s_hwloc_location , HWLOC_CPUBIND_THREAD );
+
+  print_bitmap( std::cout , s_hwloc_location );
+
+  std::cout << std::endl ;
+
+#endif
+
+  return result ;
+
+#undef HWLOC_DEBUG_PRINT
+
+}
+
+//----------------------------------------------------------------------------
+
+std::pair<unsigned,unsigned> get_this_thread_coordinate()
+{
+  std::pair<unsigned,unsigned> coord(0u,0u);
+
+  if ( ! sentinel() ) return coord ;
+
+  const unsigned n = s_core_topology.first * s_core_topology.second ;
+
+  // Using the pre-allocated 's_hwloc_location' to avoid memory
+  // allocation by this thread.  This call is NOT thread-safe.
+  hwloc_get_last_cpu_location( s_hwloc_topology ,
+                               s_hwloc_location , HWLOC_CPUBIND_THREAD );
+
+  unsigned i = 0 ;
+
+  while ( i < n && ! hwloc_bitmap_intersects( s_hwloc_location , s_core[ i ] ) ) ++i ;
+
+  if ( i < n ) {
+    coord.first  = i / s_core_topology.second ;
+    coord.second = i % s_core_topology.second ;
+  }
+
+  return coord ;
+}
+
+//----------------------------------------------------------------------------
+
+} /* namespace hwloc */
+} /* namespace Kokkos */
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#else /* ! defined( KOKKOS_HAVE_HWLOC ) */
+
+namespace Kokkos {
+namespace hwloc {
+
+bool available() { return false ; }
+
+unsigned get_available_numa_count() { return 1 ; }
+unsigned get_available_cores_per_numa() { return 1 ; }
+unsigned get_available_threads_per_core() { return 1 ; }
+
+unsigned bind_this_thread( const unsigned , std::pair<unsigned,unsigned>[] )
+{ return ~0 ; }
+
+bool bind_this_thread( const std::pair<unsigned,unsigned> )
+{ return false ; }
+
+bool unbind_this_thread()
+{ return true ; }
+
+std::pair<unsigned,unsigned> get_this_thread_coordinate()
+{ return std::pair<unsigned,unsigned>(0,0); }
+
+} // namespace hwloc
+} // namespace Kokkos
+
+//----------------------------------------------------------------------------
+//----------------------------------------------------------------------------
+
+#endif
+
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp b/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..abd845da9123d1f1b659faa1d5c167b9528f4fe4
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_spinwait.cpp
@@ -0,0 +1,82 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_spinwait.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+#if ( KOKKOS_ENABLE_ASM )
+  #if defined( __arm__ )
+    /* No-operation instruction to idle the thread. */
+    #define YIELD   asm volatile("nop")
+  #else
+    /* Pause instruction to prevent excess processor bus usage */
+    #define YIELD   asm volatile("pause\n":::"memory")
+  #endif
+#elif defined ( KOKKOS_HAVE_WINTHREAD )
+  #include <process.h>
+  #define YIELD  Sleep(0)
+#elif defined ( _WIN32 )
+  #define YIELD   __asm__ __volatile__("pause\n":::"memory")
+#else
+  #include <sched.h>
+  #define YIELD  sched_yield()
+#endif
+
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+namespace Impl {
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+void spinwait( volatile int & flag , const int value )
+{
+  while ( value == flag ) {
+    YIELD ;
+  }
+}
+#endif
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
diff --git a/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp b/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..cc87771faefcb8ad7716842890dbec4a9c1219a1
--- /dev/null
+++ b/lib/kokkos/core/src/impl/Kokkos_spinwait.hpp
@@ -0,0 +1,64 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+
+#ifndef KOKKOS_SPINWAIT_HPP
+#define KOKKOS_SPINWAIT_HPP
+
+#include <Kokkos_Macros.hpp>
+
+namespace Kokkos {
+namespace Impl {
+
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+void spinwait( volatile int & flag , const int value );
+#else
+KOKKOS_INLINE_FUNCTION
+void spinwait( volatile int & , const int ) {}
+#endif
+
+} /* namespace Impl */
+} /* namespace Kokkos */
+
+#endif /* #ifndef KOKKOS_SPINWAIT_HPP */
+
diff --git a/lib/kokkos/core/unit_test/Makefile b/lib/kokkos/core/unit_test/Makefile
new file mode 100755
index 0000000000000000000000000000000000000000..b2d3d55066406c6911929ce3659f0a9e50187c2a
--- /dev/null
+++ b/lib/kokkos/core/unit_test/Makefile
@@ -0,0 +1,146 @@
+KOKKOS_PATH = ../..
+
+GTEST_PATH = ../../TPL/gtest
+
+vpath %.cpp ${KOKKOS_PATH}/core/unit_test
+TEST_HEADERS = $(wildcard $(KOKKOS_PATH)/core/unit_test/*.hpp)
+
+default: build_all
+	echo "End Build"
+	
+
+include $(KOKKOS_PATH)/Makefile.kokkos
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	CXX = nvcc_wrapper
+	CXXFLAGS ?= -O3
+	LINK = $(CXX)
+	LDFLAGS ?= -lpthread
+else
+	CXX ?= g++
+	CXXFLAGS ?= -O3
+	LINK ?= $(CXX)
+	LDFLAGS ?= -lpthread
+endif
+
+KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/core/unit_test
+
+TEST_TARGETS = 
+TARGETS = 
+
+ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
+	OBJ_CUDA = TestCuda.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosCore_UnitTest_Cuda
+	TEST_TARGETS += test-cuda
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
+	OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosCore_UnitTest_Threads
+	TEST_TARGETS += test-threads
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
+	OBJ_OPENMP = TestOpenMP.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosCore_UnitTest_OpenMP
+	TEST_TARGETS += test-openmp
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
+	OBJ_SERIAL = TestSerial.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosCore_UnitTest_Serial
+	TEST_TARGETS += test-serial
+endif
+
+ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
+	OBJ_QTHREAD = TestQthread.o UnitTestMain.o gtest-all.o
+	TARGETS += KokkosCore_UnitTest_Qthread
+	TEST_TARGETS += test-qthread
+endif
+
+OBJ_HWLOC = TestHWLOC.o UnitTestMain.o gtest-all.o
+TARGETS += KokkosCore_UnitTest_HWLOC
+TEST_TARGETS += test-hwloc
+
+OBJ_ALLOCATIONTRACKER = TestAllocationTracker.o UnitTestMain.o gtest-all.o
+TARGETS += KokkosCore_UnitTest_AllocationTracker
+TEST_TARGETS += test-allocationtracker
+
+OBJ_DEFAULT = TestDefaultDeviceType.o UnitTestMain.o gtest-all.o
+TARGETS += KokkosCore_UnitTest_Default
+TEST_TARGETS += test-default
+
+OBJ_DEFAULTINIT = TestDefaultDeviceTypeInit.o UnitTestMain.o gtest-all.o
+TARGETS += KokkosCore_UnitTest_DefaultInit
+TEST_TARGETS += test-default-init
+
+
+KokkosCore_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Cuda
+
+KokkosCore_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Threads
+	
+KokkosCore_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_OPENMP) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_OpenMP
+
+KokkosCore_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Serial
+
+KokkosCore_UnitTest_Qthread: $(OBJ_QTHREAD) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_QTHREAD) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Qthread
+
+KokkosCore_UnitTest_HWLOC: $(OBJ_HWLOC) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_HWLOC) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_HWLOC
+
+KokkosCore_UnitTest_AllocationTracker: $(OBJ_ALLOCATIONTRACKER) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_ALLOCATIONTRACKER) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_AllocationTracker
+
+KokkosCore_UnitTest_Default: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_DEFAULT) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Default
+
+KokkosCore_UnitTest_DefaultInit: $(OBJ_DEFAULTINIT) $(KOKKOS_LINK_DEPENDS)
+	$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_DEFAULTINIT) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_DefaultInit
+
+test-cuda: KokkosCore_UnitTest_Cuda
+	./KokkosCore_UnitTest_Cuda
+
+test-threads: KokkosCore_UnitTest_Threads
+	./KokkosCore_UnitTest_Threads
+
+test-openmp: KokkosCore_UnitTest_OpenMP
+	./KokkosCore_UnitTest_OpenMP
+
+test-serial: KokkosCore_UnitTest_Serial
+	./KokkosCore_UnitTest_Serial
+	
+test-qthread: KokkosCore_UnitTest_Qthread
+	./KokkosCore_UnitTest_Qthread
+
+test-hwloc: KokkosCore_UnitTest_HWLOC
+	./KokkosCore_UnitTest_HWLOC
+	
+test-allocationtracker: KokkosCore_UnitTest_AllocationTracker
+	./KokkosCore_UnitTest_AllocationTracker
+	
+test-default: KokkosCore_UnitTest_Default
+	./KokkosCore_UnitTest_Default
+	
+test-default-init: KokkosCore_UnitTest_DefaultInit
+	./KokkosCore_UnitTest_DefaultInit
+
+build_all: $(TARGETS)
+
+test: $(TEST_TARGETS)
+	
+clean: kokkos-clean 
+	rm -f *.o $(TARGETS)
+
+# Compilation rules
+
+%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(TEST_HEADERS)
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $<
+
+gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc 
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
+
diff --git a/lib/kokkos/core/unit_test/TestAggregate.hpp b/lib/kokkos/core/unit_test/TestAggregate.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..35e7a8930d81115b99b8f7e7fad4258a22c204ca
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestAggregate.hpp
@@ -0,0 +1,716 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef TEST_AGGREGATE_HPP
+#define TEST_AGGREGATE_HPP
+
+#include <gtest/gtest.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+struct EmbedArray {};
+
+struct ArrayProxyContiguous {};
+struct ArrayProxyStrided {};
+
+template< typename T , unsigned N = 0 , class Proxy = void >
+struct Array ;
+
+template< typename T >
+struct Array<T,0,ArrayProxyContiguous>
+{
+public:
+  typedef T value_type ;
+
+  enum { StaticLength = 0 };
+  T * const value ;
+  const unsigned count ;
+
+  KOKKOS_INLINE_FUNCTION
+  Array( T * v , unsigned n ) : value(v), count(n) {}
+
+  template< class Proxy >
+  KOKKOS_INLINE_FUNCTION
+  Array & operator = ( const Array<T,0,Proxy> & rhs ) { return *this ; }
+};
+
+template< typename T , unsigned N >
+struct Array<T,N,ArrayProxyContiguous>
+{
+public:
+  typedef T value_type ;
+
+  enum { StaticLength = N };
+  T * const value ;
+
+  KOKKOS_INLINE_FUNCTION
+  Array( T * v , unsigned ) : value(v) {}
+
+  template< class Proxy >
+  KOKKOS_INLINE_FUNCTION
+  Array & operator = ( const Array<T,N,Proxy> & rhs ) { return *this ; }
+};
+
+template< typename T , unsigned N >
+struct Array<T,N,ArrayProxyStrided>
+{
+public:
+  typedef T value_type ;
+
+  enum { StaticLength = N };
+  T * const value ;
+  const unsigned stride ;
+
+  KOKKOS_INLINE_FUNCTION
+  Array( T * v , unsigned , unsigned s ) : value(v), stride(s) {}
+
+  template< class Proxy >
+  KOKKOS_INLINE_FUNCTION
+  Array & operator = ( const Array<T,N,Proxy> & rhs ) { return *this ; }
+};
+
+template< typename T >
+struct Array<T,0,ArrayProxyStrided>
+{
+public:
+  typedef T value_type ;
+
+  enum { StaticLength = 0 };
+  T * const value ;
+  const unsigned count ;
+  const unsigned stride ;
+
+  KOKKOS_INLINE_FUNCTION
+  Array( T * v , unsigned n , unsigned s ) : value(v), count(n), stride(s) {}
+
+  template< class Proxy >
+  KOKKOS_INLINE_FUNCTION
+  Array & operator = ( const Array<T,0,Proxy> & rhs ) { return *this ; }
+};
+
+template< typename T >
+struct Array<T,0,void>
+{
+public:
+  typedef T value_type ;
+
+  enum { StaticLength = 0 };
+  T * value ;
+  const unsigned count ;
+
+  KOKKOS_INLINE_FUNCTION
+  Array() : value(0) , count(0) {}
+
+  template< unsigned N , class Proxy >
+  KOKKOS_INLINE_FUNCTION
+  Array( const Array<T,N,Proxy> & rhs ) : value(rhs.value), count(N) {}
+};
+
+template< typename T , unsigned N >
+struct Array<T,N,void>
+{
+public:
+  typedef T value_type ;
+
+  enum { StaticLength = N };
+  T value[N] ;
+
+  template< class Proxy >
+  KOKKOS_INLINE_FUNCTION
+  Array & operator = ( const Array<T,N,Proxy> & ) { return *this ; }
+};
+
+} // namespace Test
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
+
+namespace Kokkos {
+namespace Impl {
+
+template< typename T , unsigned N >
+struct AnalyzeShape< Test::Array< T , N > >
+  : public ShapeInsert< typename AnalyzeShape< T >::shape , N >::type
+{
+private:
+  typedef AnalyzeShape< T > nested ;
+public:
+
+  typedef Test::EmbedArray specialize ;
+
+  typedef typename ShapeInsert< typename nested::shape , N >::type shape ;
+
+  typedef typename nested::array_intrinsic_type   array_intrinsic_type[ N ];
+  typedef Test::Array< T , N >          value_type ;
+  typedef Test::Array< T , N >          type ;
+
+  typedef const array_intrinsic_type  const_array_intrinsic_type ;
+  typedef const value_type  const_value_type ;
+  typedef const type        const_type ;
+
+  typedef typename nested::non_const_array_intrinsic_type          non_const_array_intrinsic_type[ N ];
+  typedef Test::Array< typename nested::non_const_value_type , N > non_const_value_type ;
+  typedef Test::Array< typename nested::non_const_value_type , N > non_const_type ;
+};
+
+template< typename T >
+struct AnalyzeShape< Test::Array< T , 0 > >
+  : public ShapeInsert< typename AnalyzeShape< T >::shape , 0 >::type
+{
+private:
+  typedef AnalyzeShape< T > nested ;
+public:
+
+  typedef Test::EmbedArray specialize ;
+
+  typedef typename ShapeInsert< typename nested::shape , 0 >::type shape ;
+
+  typedef typename nested::array_intrinsic_type * array_intrinsic_type ;
+  typedef Test::Array< T , 0 >          value_type ;
+  typedef Test::Array< T , 0 >          type ;
+
+  typedef const array_intrinsic_type  const_array_intrinsic_type ;
+  typedef const value_type  const_value_type ;
+  typedef const type        const_type ;
+
+  typedef typename nested::non_const_array_intrinsic_type  * non_const_array_intrinsic_type ;
+  typedef Test::Array< typename nested::non_const_value_type , 0 > non_const_value_type ;
+  typedef Test::Array< typename nested::non_const_value_type , 0 > non_const_type ;
+};
+
+/*--------------------------------------------------------------------------*/
+
+template< class ValueType , class MemorySpace , class MemoryTraits >
+struct ViewSpecialize< ValueType
+                     , Test::EmbedArray
+                     , LayoutLeft
+                     , MemorySpace
+                     , MemoryTraits >
+{ typedef Test::EmbedArray type ; };
+
+template< class ValueType , class MemorySpace , class MemoryTraits >
+struct ViewSpecialize< ValueType
+                     , Test::EmbedArray
+                     , LayoutRight
+                     , MemorySpace
+                     , MemoryTraits >
+{ typedef Test::EmbedArray type ; };
+
+/*--------------------------------------------------------------------------*/
+
+template<>
+struct ViewAssignment< Test::EmbedArray , Test::EmbedArray , void >
+{
+  //------------------------------------
+  /** \brief  Compatible value and shape */
+
+  template< class DT , class DL , class DD , class DM ,
+            class ST , class SL , class SD , class SM >
+  KOKKOS_INLINE_FUNCTION
+  ViewAssignment(       View<DT,DL,DD,DM,Test::EmbedArray> & dst
+                , const View<ST,SL,SD,SM,Test::EmbedArray> & src
+                , const typename enable_if<(
+                    ViewAssignable< ViewTraits<DT,DL,DD,DM> ,
+                                    ViewTraits<ST,SL,SD,SM> >::value
+                    )>::type * = 0
+                  )
+  {
+    dst.m_offset_map.assign( src.m_offset_map );
+
+    dst.m_ptr_on_device = src.m_ptr_on_device ;
+
+    dst.m_tracker = src.m_tracker;
+  }
+};
+
+template<>
+struct ViewAssignment< ViewDefault , Test::EmbedArray , void >
+{
+  //------------------------------------
+  /** \brief  Compatible value and shape */
+
+  template< class ST , class SL , class SD , class SM >
+  KOKKOS_INLINE_FUNCTION
+  ViewAssignment( typename View<ST,SL,SD,SM,Test::EmbedArray>::array_type & dst
+                , const View<ST,SL,SD,SM,Test::EmbedArray> & src
+                )
+  {
+    dst.m_offset_map.assign( src.m_offset_map );
+
+    dst.m_ptr_on_device = src.m_ptr_on_device ;
+
+    dst.m_tracker = src.m_tracker;
+  }
+};
+
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Kokkos {
+
+template< class DataType ,
+          class Arg1Type ,
+          class Arg2Type ,
+          class Arg3Type >
+class View< DataType , Arg1Type , Arg2Type , Arg3Type , Test::EmbedArray >
+  : public ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type >
+{
+public:
+
+  typedef ViewTraits< DataType , Arg1Type , Arg2Type, Arg3Type > traits ;
+
+private:
+
+  // Assignment of compatible views requirement:
+  template< class , class , class , class , class > friend class View ;
+
+  // Assignment of compatible subview requirement:
+  template< class , class , class > friend struct Impl::ViewAssignment ;
+
+  typedef Impl::ViewOffset< typename traits::shape_type ,
+                            typename traits::array_layout > offset_map_type ;
+
+  typedef Impl::ViewDataManagement< traits > view_data_management ;
+
+  // traits::value_type = Test::Array< T , N >
+
+  typename traits::value_type::value_type * m_ptr_on_device ;
+  offset_map_type                           m_offset_map ;
+  view_data_management                      m_management ;
+  Impl::AllocationTracker                   m_tracker ;
+
+public:
+
+  typedef View< typename traits::array_intrinsic_type ,
+                typename traits::array_layout ,
+                typename traits::execution_space ,
+                typename traits::memory_traits > array_type ;
+
+  typedef View< typename traits::non_const_data_type ,
+                typename traits::array_layout ,
+                typename traits::execution_space ,
+                typename traits::memory_traits > non_const_type ;
+
+  typedef View< typename traits::const_data_type ,
+                typename traits::array_layout ,
+                typename traits::execution_space ,
+                typename traits::memory_traits > const_type ;
+
+  typedef View< typename traits::non_const_data_type ,
+                typename traits::array_layout ,
+                typename traits::host_mirror_space ,
+                void > HostMirror ;
+
+  //------------------------------------
+  // Shape
+
+  enum { Rank = traits::rank - 1 };
+
+  KOKKOS_INLINE_FUNCTION typename traits::shape_type shape() const { return m_offset_map ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_0() const { return m_offset_map.N0 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_1() const { return m_offset_map.N1 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_2() const { return m_offset_map.N2 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_3() const { return m_offset_map.N3 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_4() const { return m_offset_map.N4 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_5() const { return m_offset_map.N5 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_6() const { return m_offset_map.N6 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type dimension_7() const { return m_offset_map.N7 ; }
+  KOKKOS_INLINE_FUNCTION typename traits::size_type size() const
+  {
+    return   m_offset_map.N0
+           * m_offset_map.N1
+           * m_offset_map.N2
+           * m_offset_map.N3
+           * m_offset_map.N4
+           * m_offset_map.N5
+           * m_offset_map.N6
+           * m_offset_map.N7
+           ;
+  }
+
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  typename traits::size_type dimension( const iType & i ) const
+    { return Impl::dimension( m_offset_map , i ); }
+
+  //------------------------------------
+  // Destructor, constructors, assignment operators:
+
+  KOKKOS_INLINE_FUNCTION
+  ~View() {}
+
+  KOKKOS_INLINE_FUNCTION
+  View()
+    : m_ptr_on_device(0)
+    , m_offset_map()
+    , m_management()
+    , m_tracker()
+  { m_offset_map.assing(0,0,0,0,0,0,0,0); }
+
+  KOKKOS_INLINE_FUNCTION
+  View( const View & rhs )
+    : m_ptr_on_device(0)
+    , m_offset_map()
+    , m_management()
+    , m_tracker()
+  {
+    (void) Impl::ViewAssignment<
+      typename traits::specialize ,
+      typename traits::specialize >( *this , rhs );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( const View & rhs )
+    {
+      (void) Impl::ViewAssignment<
+        typename traits::specialize ,
+        typename traits::specialize >( *this , rhs );
+      return *this ;
+    }
+
+  //------------------------------------
+  // Construct or assign compatible view:
+
+  template< class RT , class RL , class RD , class RM , class RS >
+  KOKKOS_INLINE_FUNCTION
+  View( const View<RT,RL,RD,RM,RS> & rhs )
+    : m_ptr_on_device(0)
+    , m_offset_map()
+    , m_management()
+    , m_tracker()
+  {
+    (void) Impl::ViewAssignment<
+      typename traits::specialize , RS >( *this , rhs );
+  }
+
+  template< class RT , class RL , class RD , class RM , class RS >
+  KOKKOS_INLINE_FUNCTION
+  View & operator = ( const View<RT,RL,RD,RM,RS> & rhs )
+    {
+      (void) Impl::ViewAssignment<
+        typename traits::specialize , RS >( *this , rhs );
+      return *this ;
+    }
+
+  //------------------------------------
+  // Allocation of a managed view with possible alignment padding.
+
+  template< class AllocationProperties >
+  explicit inline
+  View( const AllocationProperties & prop ,
+        const typename Impl::ViewAllocProp< traits , AllocationProperties >::size_type n0 = 0 ,
+        const size_t n1 = 0 ,
+        const size_t n2 = 0 ,
+        const size_t n3 = 0 ,
+        const size_t n4 = 0 ,
+        const size_t n5 = 0 ,
+        const size_t n6 = 0 ,
+        const size_t n7 = 0 )
+    : m_ptr_on_device(0)
+    , m_offset_map()
+    , m_management()
+    , m_tracker()
+  {
+    typedef Impl::ViewAllocProp< traits , AllocationProperties > Alloc ;
+
+    typedef typename traits::memory_space  memory_space ;
+    typedef typename traits::value_type::value_type   scalar_type ;
+
+    m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 );
+    m_offset_map.set_padding();
+
+    m_tracker = memory_space::allocate_and_track( Alloc::label( prop ), sizeof(scalar_type) * m_offset_map.capacity() );
+
+    m_ptr_on_device = reinterpret_cast<scalar_type *>(m_tracker.alloc_ptr());
+
+    (void) Impl::ViewDefaultConstruct< typename traits::execution_space , scalar_type , Alloc::Initialize >( m_ptr_on_device , m_offset_map.capacity() );
+  }
+
+  //------------------------------------
+  // Assign an unmanaged View from pointer, can be called in functors.
+  // No alignment padding is performed.
+
+  typedef Impl::if_c< ! traits::is_managed ,
+                      typename traits::value_type::value_type * ,
+                      Impl::ViewError::user_pointer_constructor_requires_unmanaged >
+    if_user_pointer_constructor ;
+
+  View( typename if_user_pointer_constructor::type ptr ,
+        const size_t n0 = 0 ,
+        const size_t n1 = 0 ,
+        const size_t n2 = 0 ,
+        const size_t n3 = 0 ,
+        const size_t n4 = 0 ,
+        const size_t n5 = 0 ,
+        const size_t n6 = 0 ,
+        const size_t n7 = 0 )
+    : m_ptr_on_device(0)
+    , m_offset_map()
+    , m_management()
+    , m_tracker()
+  {
+    m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 );
+    m_ptr_on_device = if_user_pointer_constructor::select( ptr );
+    m_management.set_unmanaged();
+  }
+
+  //------------------------------------
+  // Assign unmanaged View to portion of Device shared memory
+
+  typedef Impl::if_c< ! traits::is_managed ,
+                      typename traits::execution_space ,
+                      Impl::ViewError::device_shmem_constructor_requires_unmanaged >
+      if_device_shmem_constructor ;
+
+  explicit KOKKOS_INLINE_FUNCTION
+  View( typename if_device_shmem_constructor::type & dev ,
+        const unsigned n0 = 0 ,
+        const unsigned n1 = 0 ,
+        const unsigned n2 = 0 ,
+        const unsigned n3 = 0 ,
+        const unsigned n4 = 0 ,
+        const unsigned n5 = 0 ,
+        const unsigned n6 = 0 ,
+        const unsigned n7 = 0 )
+    : m_ptr_on_device(0)
+    , m_offset_map()
+    , m_management()
+    , m_tracker()
+  {
+    typedef typename traits::value_type::value_type   scalar_type ;
+
+    enum { align = 8 };
+    enum { mask  = align - 1 };
+
+    typedef Impl::if_c< ! traits::is_managed ,
+                        scalar_type * ,
+                        Impl::ViewError::device_shmem_constructor_requires_unmanaged >
+      if_device_shmem_pointer ;
+
+    m_offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 );
+
+    // Select the first argument:
+    m_ptr_on_device = if_device_shmem_pointer::select(
+     (scalar_type *) dev.get_shmem( unsigned( sizeof(scalar_type) * m_offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ) );
+  }
+
+  static inline
+  unsigned shmem_size( const unsigned n0 = 0 ,
+                       const unsigned n1 = 0 ,
+                       const unsigned n2 = 0 ,
+                       const unsigned n3 = 0 ,
+                       const unsigned n4 = 0 ,
+                       const unsigned n5 = 0 ,
+                       const unsigned n6 = 0 ,
+                       const unsigned n7 = 0 )
+  {
+    enum { align = 8 };
+    enum { mask  = align - 1 };
+
+    typedef typename traits::value_type::value_type   scalar_type ;
+
+    offset_map_type offset_map ;
+
+    offset_map.assign( n0, n1, n2, n3, n4, n5, n6, n7 );
+
+    return unsigned( sizeof(scalar_type) * offset_map.capacity() + unsigned(mask) ) & ~unsigned(mask) ;
+  }
+
+  //------------------------------------
+  // Is not allocated
+
+  KOKKOS_INLINE_FUNCTION
+  bool is_null() const { return 0 == m_ptr_on_device ; }
+
+  //------------------------------------
+  // LayoutLeft, rank 2:
+
+  typedef Test::Array< typename traits::value_type::value_type ,
+                       traits::value_type::StaticLength ,
+                       Test::ArrayProxyStrided > LeftValue ;
+
+  template< typename iType0 >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< LeftValue , traits, LayoutLeft, 2, iType0 >::type
+    operator[] ( const iType0 & i0 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device );
+
+      return LeftValue( m_ptr_on_device + i0 , m_offset_map.N1 , m_offset_map.S0 );
+    }
+
+  template< typename iType0 >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< LeftValue , traits, LayoutLeft, 2, iType0 >::type
+    operator() ( const iType0 & i0 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device );
+
+      return LeftValue( m_ptr_on_device + i0 , m_offset_map.N1 , m_offset_map.S0 );
+    }
+
+  template< typename iType0 >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< LeftValue , traits, LayoutLeft, 2, iType0 >::type
+    at( const iType0 & i0 , const int , const int , const int ,
+        const int , const int , const int , const int ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device );
+
+      return LeftValue( m_ptr_on_device + i0 , m_offset_map.N1 , m_offset_map.S0 );
+    }
+
+  //------------------------------------
+  // LayoutRight, rank 2:
+
+  typedef Test::Array< typename traits::value_type::value_type ,
+                       traits::value_type::StaticLength ,
+                       Test::ArrayProxyContiguous > RightValue ;
+
+  template< typename iType0 >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< RightValue , traits, LayoutRight, 2, iType0 >::type
+    operator[] ( const iType0 & i0 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device );
+
+      return RightValue( m_ptr_on_device + i0 , m_offset_map.N1 );
+    }
+
+  template< typename iType0 >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< RightValue , traits, LayoutRight, 2, iType0 >::type
+    operator() ( const iType0 & i0 ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device );
+
+      return RightValue( m_ptr_on_device + i0 , m_offset_map.N1 );
+    }
+
+  template< typename iType0 >
+  KOKKOS_INLINE_FUNCTION
+  typename Impl::ViewEnableArrayOper< RightValue , traits, LayoutRight, 2, iType0 >::type
+    at( const iType0 & i0 , const int , const int , const int ,
+        const int , const int , const int , const int ) const
+    {
+      KOKKOS_ASSERT_SHAPE_BOUNDS_2( m_offset_map, i0, 0 );
+      KOKKOS_RESTRICT_EXECUTION_TO_DATA( typename traits::memory_space , m_ptr_on_device );
+
+      return RightValue( m_ptr_on_device + i0 , m_offset_map.N1 );
+    }
+
+  //------------------------------------
+  // Access to the underlying contiguous storage of this view specialization.
+  // These methods are specific to specialization of a view.
+
+  KOKKOS_INLINE_FUNCTION
+  typename traits::value_type::value_type * ptr_on_device() const { return m_ptr_on_device ; }
+
+  // Stride of physical storage, dimensioned to at least Rank
+  template< typename iType >
+  KOKKOS_INLINE_FUNCTION
+  void stride( iType * const s ) const
+    { m_offset_map.stride( s ); }
+
+  // Count of contiguously allocated data members including padding.
+  KOKKOS_INLINE_FUNCTION
+  typename traits::size_type capacity() const
+    { return m_offset_map.capacity(); }
+};
+
+} // namespace Kokkos
+
+#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
+
+/*--------------------------------------------------------------------------*/
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+template< class DeviceType >
+int TestViewAggregate()
+{
+
+#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
+
+  typedef Kokkos::View< Test::Array<double,32> * , DeviceType > a32_type ;
+  typedef typename a32_type::array_type a32_base_type ;
+
+  typedef Kokkos::View< Test::Array<double> * , DeviceType > a0_type ;
+  typedef typename a0_type::array_type a0_base_type ;
+
+  a32_type      a32("a32",100);
+  a32_base_type a32_base ;
+
+  a0_type       a0("a0",100,32);
+  a0_base_type  a0_base ;
+
+  a32_base = a32 ;
+  a0_base = a0 ;
+
+#endif /* #if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW ) */
+
+  return 0 ;
+}
+
+}
+
+
+#endif /* #ifndef TEST_AGGREGATE_HPP */
diff --git a/lib/kokkos/core/unit_test/TestAggregateReduction.hpp b/lib/kokkos/core/unit_test/TestAggregateReduction.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..7175d34348f4f7f7b1db353fd470635aa77a4341
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestAggregateReduction.hpp
@@ -0,0 +1,189 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#ifndef TEST_AGGREGATE_REDUCTION_HPP
+#define TEST_AGGREGATE_REDUCTION_HPP
+
+#include <gtest/gtest.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+namespace Test {
+
+template< typename T , unsigned N >
+struct StaticArray {
+  T value[N] ;
+
+  KOKKOS_INLINE_FUNCTION
+  StaticArray()
+    { for ( unsigned i = 0 ; i < N ; ++i ) value[i] = T(); }
+
+  KOKKOS_INLINE_FUNCTION
+  StaticArray( const StaticArray & rhs )
+    { for ( unsigned i = 0 ; i < N ; ++i ) value[i] = rhs.value[i]; }
+
+  KOKKOS_INLINE_FUNCTION
+  operator T () { return value[0]; }
+
+  KOKKOS_INLINE_FUNCTION
+  StaticArray & operator = ( const T & rhs )
+    {
+      for ( unsigned i = 0 ; i < N ; ++i ) value[i] = rhs ;
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  StaticArray & operator = ( const StaticArray & rhs )
+    {
+      for ( unsigned i = 0 ; i < N ; ++i ) value[i] = rhs.value[i] ;
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  StaticArray operator * ( const StaticArray & rhs )
+    {
+      StaticArray tmp ;
+      for ( unsigned i = 0 ; i < N ; ++i ) tmp.value[i] = value[i] * rhs.value[i] ;
+      return tmp ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  StaticArray operator + ( const StaticArray & rhs )
+    {
+      StaticArray tmp ;
+      for ( unsigned i = 0 ; i < N ; ++i ) tmp.value[i] = value[i] + rhs.value[i] ;
+      return tmp ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  StaticArray & operator += ( const StaticArray & rhs )
+    {
+      for ( unsigned i = 0 ; i < N ; ++i ) value[i] += rhs.value[i] ;
+      return *this ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator += ( const volatile StaticArray & rhs ) volatile
+    {
+      for ( unsigned i = 0 ; i < N ; ++i ) value[i] += rhs.value[i] ;
+    }
+};
+
+template< typename T , class Space >
+struct DOT {
+  typedef T      value_type ;
+  typedef Space  execution_space ;
+
+  Kokkos::View< value_type * , Space > a ;
+  Kokkos::View< value_type * , Space > b ;
+
+  DOT( const Kokkos::View< value_type * , Space > arg_a
+     , const Kokkos::View< value_type * , Space > arg_b
+     )
+    : a( arg_a ), b( arg_b ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i , value_type & update ) const
+    {
+      update += a(i) * b(i);
+    }
+};
+
+template< typename T , class Space >
+struct FILL {
+  typedef T      value_type ;
+  typedef Space  execution_space ;
+
+  Kokkos::View< value_type * , Space > a ;
+  Kokkos::View< value_type * , Space > b ;
+
+  FILL( const Kokkos::View< value_type * , Space > & arg_a
+      , const Kokkos::View< value_type * , Space > & arg_b
+      )
+    : a( arg_a ), b( arg_b ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i ) const
+    {
+      a(i) = i % 2 ? i + 1 : 1 ;
+      b(i) = i % 2 ? 1 : i + 1 ;
+    }
+};
+
+template< class Space >
+void TestViewAggregateReduction()
+{
+  const int count = 2 ;
+  const long result = count % 2 ? ( count * ( ( count + 1 ) / 2 ) )
+                                : ( ( count / 2 ) * ( count + 1 ) );
+
+  Kokkos::View< long * , Space > a("a",count);
+  Kokkos::View< long * , Space > b("b",count);
+  Kokkos::View< StaticArray<long,4> * , Space > a4("a4",count);
+  Kokkos::View< StaticArray<long,4> * , Space > b4("b4",count);
+  Kokkos::View< StaticArray<long,10> * , Space > a10("a10",count);
+  Kokkos::View< StaticArray<long,10> * , Space > b10("b10",count);
+
+  Kokkos::parallel_for( count , FILL<long,Space>(a,b) );
+  Kokkos::parallel_for( count , FILL< StaticArray<long,4> , Space >(a4,b4) );
+  Kokkos::parallel_for( count , FILL< StaticArray<long,10> , Space >(a10,b10) );
+
+  long r = 0;
+  StaticArray<long,4> r4 ;
+  StaticArray<long,10> r10 ;
+
+  Kokkos::parallel_reduce( count , DOT<long,Space>(a,b) , r );
+  Kokkos::parallel_reduce( count , DOT< StaticArray<long,4> , Space >(a4,b4) , r4 );
+  Kokkos::parallel_reduce( count , DOT< StaticArray<long,10> , Space >(a10,b10) , r10 );
+
+  ASSERT_EQ( result , r );
+  for ( int i = 0 ; i < 10 ; ++i ) { ASSERT_EQ( result , r10.value[i] ); }
+  for ( int i = 0 ; i < 4 ; ++i ) { ASSERT_EQ( result , r4.value[i] ); }
+}
+
+}
+
+#endif /* #ifndef TEST_AGGREGATE_REDUCTION_HPP */
+
diff --git a/lib/kokkos/core/unit_test/TestAllocationTracker.cpp b/lib/kokkos/core/unit_test/TestAllocationTracker.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..371b0ac7588c7239ebf8a7f146faea63bc37faa2
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestAllocationTracker.cpp
@@ -0,0 +1,145 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+#include <vector>
+
+#include <Kokkos_Core.hpp>
+
+#include <impl/Kokkos_AllocationTracker.hpp>
+#include <impl/Kokkos_BasicAllocators.hpp>
+
+namespace Test {
+
+class alocation_tracker : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    Kokkos::initialize();
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::finalize();
+  }
+};
+
+TEST_F( alocation_tracker, simple)
+{
+  using namespace Kokkos::Impl;
+
+  {
+    AllocationTracker tracker;
+    EXPECT_FALSE( tracker.is_valid() );
+  }
+
+  // test ref count and label
+  {
+    int size = 100;
+    std::vector<AllocationTracker> trackers(size);
+
+    trackers[0] = AllocationTracker( MallocAllocator(), 128,"Test");
+
+    for (int i=0; i<size; ++i) {
+      trackers[i] = trackers[0];
+    }
+
+    EXPECT_EQ(100u, trackers[0].ref_count());
+    EXPECT_EQ(std::string("Test"), std::string(trackers[0].label()));
+  }
+
+
+  // test circular list
+  {
+    int num_allocs = 3000;
+    unsigned ref_count = 100;
+
+    std::vector<AllocationTracker> trackers(num_allocs);
+
+    for (int i=0; i<num_allocs; ++i) {
+      trackers[i] = AllocationTracker( MallocAllocator(), 128, "Test");
+      std::vector<AllocationTracker> ref_trackers(ref_count);
+      for (unsigned j=0; j<ref_count; ++j) {
+        ref_trackers[j] = trackers[i];
+      }
+      EXPECT_EQ( ref_count + 1u, trackers[i].ref_count() );
+    }
+
+    for (int i=0; i<num_allocs; ++i) {
+      EXPECT_EQ( 1u, trackers[i].ref_count() );
+    }
+  }
+}
+
+TEST_F( alocation_tracker, force_leaks)
+{
+// uncomment to force memory leaks
+#if 0
+  using namespace Kokkos::Impl;
+  Kokkos::kokkos_malloc("Forced Leak", 4096*10);
+  Kokkos::kokkos_malloc<Kokkos::HostSpace>("Forced Leak", 4096*10);
+#endif
+}
+
+TEST_F( alocation_tracker, disable_reference_counting)
+{
+  using namespace Kokkos::Impl;
+  // test ref count and label
+  {
+    int size = 100;
+    std::vector<AllocationTracker> trackers(size);
+
+    trackers[0] = AllocationTracker( MallocAllocator(), 128,"Test");
+
+    for (int i=1; i<size; ++i) {
+      trackers[i] = CopyWithoutTracking::apply(trackers[0]);
+    }
+
+    EXPECT_EQ(1u, trackers[0].ref_count());
+    EXPECT_EQ(std::string("Test"), std::string(trackers[0].label()));
+  }
+}
+
+} // namespace Test
diff --git a/lib/kokkos/core/unit_test/TestAtomic.hpp b/lib/kokkos/core/unit_test/TestAtomic.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..d273c287e8cb41b7dd836b3c72266f42d740bcbf
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestAtomic.hpp
@@ -0,0 +1,376 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+namespace TestAtomic {
+
+// Struct for testing arbitrary size atomics
+
+template<int N>
+struct SuperScalar {
+  double val[N];
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar() {
+    for(int i=0; i<N; i++)
+      val[i] = 0.0;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar(const SuperScalar& src) {
+    for(int i=0; i<N; i++)
+      val[i] = src.val[i];
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar(const volatile SuperScalar& src) {
+    for(int i=0; i<N; i++)
+      val[i] = src.val[i];
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar& operator = (const SuperScalar& src) {
+    for(int i=0; i<N; i++)
+      val[i] = src.val[i];
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar& operator = (const volatile SuperScalar& src) {
+    for(int i=0; i<N; i++)
+      val[i] = src.val[i];
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  volatile SuperScalar& operator = (const SuperScalar& src) volatile  {
+    for(int i=0; i<N; i++)
+      val[i] = src.val[i];
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar operator + (const SuperScalar& src) {
+    SuperScalar tmp = *this;
+    for(int i=0; i<N; i++)
+      tmp.val[i] += src.val[i];
+    return tmp;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar& operator += (const double& src) {
+    for(int i=0; i<N; i++)
+      val[i] += 1.0*(i+1)*src;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar& operator += (const SuperScalar& src) {
+    for(int i=0; i<N; i++)
+      val[i] += src.val[i];
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator == (const SuperScalar& src) {
+    bool compare = true;
+    for(int i=0; i<N; i++)
+      compare = compare && ( val[i] == src.val[i]);
+    return compare;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  bool operator != (const SuperScalar& src) {
+    bool compare = true;
+    for(int i=0; i<N; i++)
+      compare = compare && ( val[i] == src.val[i]);
+    return !compare;
+  }
+
+
+
+  KOKKOS_INLINE_FUNCTION
+  SuperScalar(const double& src) {
+    for(int i=0; i<N; i++)
+      val[i] = 1.0 * (i+1) * src;
+  }
+
+};
+
+template<int N>
+std::ostream& operator<<(std::ostream& os, const SuperScalar<N>& dt)
+{
+    os << "{ ";
+    for(int i=0;i<N-1;i++)
+       os << dt.val[i] << ", ";
+    os << dt.val[N-1] << "}";
+    return os;
+}
+
+template<class T,class DEVICE_TYPE>
+struct ZeroFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef typename Kokkos::View<T,execution_space> type;
+  typedef typename Kokkos::View<T,execution_space>::HostMirror h_type;
+  type data;
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    data() = 0;
+  }
+};
+
+//---------------------------------------------------
+//--------------atomic_fetch_add---------------------
+//---------------------------------------------------
+
+template<class T,class DEVICE_TYPE>
+struct AddFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+    Kokkos::atomic_fetch_add(&data(),(T)1);
+  }
+};
+
+template<class T, class execution_space >
+T AddLoop(int loop) {
+  struct ZeroFunctor<T,execution_space> f_zero;
+  typename ZeroFunctor<T,execution_space>::type data("Data");
+  typename ZeroFunctor<T,execution_space>::h_type h_data("HData");
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  execution_space::fence();
+
+  struct AddFunctor<T,execution_space> f_add;
+  f_add.data = data;
+  Kokkos::parallel_for(loop,f_add);
+  execution_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+  return val;
+}
+
+template<class T>
+T AddLoopSerial(int loop) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  for(int i=0;i<loop;i++)
+  *data+=(T)1;
+
+  T val = *data;
+  delete data;
+  return val;
+}
+
+template<class T,class DEVICE_TYPE>
+struct CASFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int) const {
+	  T old = data();
+	  T newval, assumed;
+	  do {
+	    assumed = old;
+	    newval = assumed + (T)1;
+	    old = Kokkos::atomic_compare_exchange(&data(), assumed, newval);
+	  }
+	  while( old != assumed );
+  }
+};
+
+template<class T, class execution_space >
+T CASLoop(int loop) {
+  struct ZeroFunctor<T,execution_space> f_zero;
+  typename ZeroFunctor<T,execution_space>::type data("Data");
+  typename ZeroFunctor<T,execution_space>::h_type h_data("HData");
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  execution_space::fence();
+
+  struct CASFunctor<T,execution_space> f_cas;
+  f_cas.data = data;
+  Kokkos::parallel_for(loop,f_cas);
+  execution_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  T val = h_data();
+
+  return val;
+}
+
+template<class T>
+T CASLoopSerial(int loop) {
+  T* data = new T[1];
+  data[0] = 0;
+
+  for(int i=0;i<loop;i++) {
+	  T assumed;
+	  T newval;
+	  T old;
+	  do {
+	    assumed = *data;
+	    newval = assumed + (T)1;
+	    old = *data;
+	    *data = newval;
+	  }
+	  while(!(assumed==old));
+  }
+
+  T val = *data;
+  delete data;
+  return val;
+}
+
+template<class T,class DEVICE_TYPE>
+struct ExchFunctor{
+  typedef DEVICE_TYPE execution_space;
+  typedef Kokkos::View<T,execution_space> type;
+  type data, data2;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+    T old = Kokkos::atomic_exchange(&data(),(T)i);
+    Kokkos::atomic_fetch_add(&data2(),old);
+  }
+};
+
+template<class T, class execution_space >
+T ExchLoop(int loop) {
+  struct ZeroFunctor<T,execution_space> f_zero;
+  typename ZeroFunctor<T,execution_space>::type data("Data");
+  typename ZeroFunctor<T,execution_space>::h_type h_data("HData");
+  f_zero.data = data;
+  Kokkos::parallel_for(1,f_zero);
+  execution_space::fence();
+
+  typename ZeroFunctor<T,execution_space>::type data2("Data");
+  typename ZeroFunctor<T,execution_space>::h_type h_data2("HData");
+  f_zero.data = data2;
+  Kokkos::parallel_for(1,f_zero);
+  execution_space::fence();
+
+  struct ExchFunctor<T,execution_space> f_exch;
+  f_exch.data = data;
+  f_exch.data2 = data2;
+  Kokkos::parallel_for(loop,f_exch);
+  execution_space::fence();
+
+  Kokkos::deep_copy(h_data,data);
+  Kokkos::deep_copy(h_data2,data2);
+  T val = h_data() + h_data2();
+
+  return val;
+}
+
+template<class T>
+T ExchLoopSerial(int loop) {
+  T* data = new T[1];
+  T* data2 = new T[1];
+  data[0] = 0;
+  data2[0] = 0;
+  for(int i=0;i<loop;i++) {
+	T old = *data;
+	*data=(T) i;
+	*data2+=old;
+  }
+
+  T val = *data2 + *data;
+  delete data;
+  delete data2;
+  return val;
+}
+
+template<class T, class DeviceType >
+T LoopVariant(int loop, int test) {
+  switch (test) {
+    case 1: return AddLoop<T,DeviceType>(loop);
+    case 2: return CASLoop<T,DeviceType>(loop);
+    case 3: return ExchLoop<T,DeviceType>(loop);
+  }
+  return 0;
+}
+
+template<class T>
+T LoopVariantSerial(int loop, int test) {
+  switch (test) {
+    case 1: return AddLoopSerial<T>(loop);
+    case 2: return CASLoopSerial<T>(loop);
+    case 3: return ExchLoopSerial<T>(loop);
+  }
+  return 0;
+}
+
+template<class T,class DeviceType>
+bool Loop(int loop, int test)
+{
+  T res       = LoopVariant<T,DeviceType>(loop,test);
+  T resSerial = LoopVariantSerial<T>(loop,test);
+
+  bool passed = true;
+
+  if ( resSerial != res ) {
+    passed = false;
+
+    std::cout << "Loop<"
+              << typeid(T).name()
+              << ">( test = "
+              << test << " FAILED : "
+              << resSerial << " != " << res
+              << std::endl ;
+  }
+
+
+  return passed ;
+}
+
+}
+
diff --git a/lib/kokkos/core/unit_test/TestCXX11.hpp b/lib/kokkos/core/unit_test/TestCXX11.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..f48c76de508c1c828466955012dfaa76fb925866
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestCXX11.hpp
@@ -0,0 +1,319 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+#include <Kokkos_Core.hpp>
+
+namespace TestCXX11 {
+
+template<class DeviceType>
+struct FunctorAddTest{
+  typedef Kokkos::View<double**,DeviceType> view_type;
+  view_type a_, b_;
+  typedef DeviceType execution_space;
+  FunctorAddTest(view_type & a, view_type &b):a_(a),b_(b) {}
+  void operator() (const int& i) const {
+    b_(i,0) = a_(i,1) + a_(i,2);
+    b_(i,1) = a_(i,0) - a_(i,3);
+    b_(i,2) = a_(i,4) + a_(i,0);
+    b_(i,3) = a_(i,2) - a_(i,1);
+    b_(i,4) = a_(i,3) + a_(i,4);
+  }
+
+  typedef typename Kokkos::TeamPolicy< execution_space >::member_type  team_member ;
+  void operator() (const team_member & dev) const {
+    int i = dev.league_rank()*dev.team_size() + dev.team_rank();
+    b_(i,0) = a_(i,1) + a_(i,2);
+    b_(i,1) = a_(i,0) - a_(i,3);
+    b_(i,2) = a_(i,4) + a_(i,0);
+    b_(i,3) = a_(i,2) - a_(i,1);
+    b_(i,4) = a_(i,3) + a_(i,4);
+  }
+};
+
+template<class DeviceType, bool PWRTest>
+double AddTestFunctor() {
+
+  typedef Kokkos::TeamPolicy<DeviceType> policy_type ;
+
+  Kokkos::View<double**,DeviceType> a("A",100,5);
+  Kokkos::View<double**,DeviceType> b("B",100,5);
+  typename Kokkos::View<double**,DeviceType>::HostMirror h_a = Kokkos::create_mirror_view(a);
+  typename Kokkos::View<double**,DeviceType>::HostMirror h_b = Kokkos::create_mirror_view(b);
+
+  for(int i=0;i<100;i++) {
+    for(int j=0;j<5;j++)
+       h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j;
+  }
+  Kokkos::deep_copy(a,h_a);
+
+  if(PWRTest==false)
+    Kokkos::parallel_for(100,FunctorAddTest<DeviceType>(a,b));
+  else
+    Kokkos::parallel_for(policy_type(25,4),FunctorAddTest<DeviceType>(a,b));
+  Kokkos::deep_copy(h_b,b);
+
+  double result = 0;
+  for(int i=0;i<100;i++) {
+      for(int j=0;j<5;j++)
+         result += h_b(i,j);
+    }
+
+  return result;
+}
+
+
+
+#if defined (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
+template<class DeviceType, bool PWRTest>
+double AddTestLambda() {
+
+  typedef Kokkos::TeamPolicy<DeviceType> policy_type ;
+
+  Kokkos::View<double**,DeviceType> a("A",100,5);
+  Kokkos::View<double**,DeviceType> b("B",100,5);
+  typename Kokkos::View<double**,DeviceType>::HostMirror h_a = Kokkos::create_mirror_view(a);
+  typename Kokkos::View<double**,DeviceType>::HostMirror h_b = Kokkos::create_mirror_view(b);
+
+  for(int i=0;i<100;i++) {
+    for(int j=0;j<5;j++)
+       h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j;
+  }
+  Kokkos::deep_copy(a,h_a);
+
+  if(PWRTest==false) {
+    Kokkos::parallel_for(100,[=](const int& i)  {
+      b(i,0) = a(i,1) + a(i,2);
+      b(i,1) = a(i,0) - a(i,3);
+      b(i,2) = a(i,4) + a(i,0);
+      b(i,3) = a(i,2) - a(i,1);
+      b(i,4) = a(i,3) + a(i,4);
+    });
+  } else {
+    typedef typename policy_type::member_type team_member ;
+    Kokkos::parallel_for(policy_type(25,4),[=](const team_member & dev)  {
+      int i = dev.league_rank()*dev.team_size() + dev.team_rank();
+      b(i,0) = a(i,1) + a(i,2);
+      b(i,1) = a(i,0) - a(i,3);
+      b(i,2) = a(i,4) + a(i,0);
+      b(i,3) = a(i,2) - a(i,1);
+      b(i,4) = a(i,3) + a(i,4);
+    });
+  }
+  Kokkos::deep_copy(h_b,b);
+
+  double result = 0;
+  for(int i=0;i<100;i++) {
+      for(int j=0;j<5;j++)
+         result += h_b(i,j);
+    }
+
+  return result;
+}
+
+#else
+template<class DeviceType, bool PWRTest>
+double AddTestLambda() {
+  return AddTestFunctor<DeviceType,PWRTest>();
+}
+#endif
+
+
+template<class DeviceType>
+struct FunctorReduceTest{
+  typedef Kokkos::View<double**,DeviceType> view_type;
+  view_type a_;
+  typedef DeviceType execution_space;
+  typedef double value_type;
+  FunctorReduceTest(view_type & a):a_(a) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int& i, value_type& sum) const {
+    sum += a_(i,1) + a_(i,2);
+    sum += a_(i,0) - a_(i,3);
+    sum += a_(i,4) + a_(i,0);
+    sum += a_(i,2) - a_(i,1);
+    sum += a_(i,3) + a_(i,4);
+  }
+
+  typedef typename Kokkos::TeamPolicy< execution_space >::member_type  team_member ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const team_member & dev, value_type& sum) const {
+    int i = dev.league_rank()*dev.team_size() + dev.team_rank();
+    sum += a_(i,1) + a_(i,2);
+    sum += a_(i,0) - a_(i,3);
+    sum += a_(i,4) + a_(i,0);
+    sum += a_(i,2) - a_(i,1);
+    sum += a_(i,3) + a_(i,4);
+  }
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& update) const {update = 0.0;}
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& update, volatile value_type const& input) const {update += input;}
+};
+
+template<class DeviceType, bool PWRTest>
+double ReduceTestFunctor() {
+
+  typedef Kokkos::TeamPolicy<DeviceType> policy_type ;
+  typedef Kokkos::View<double**,DeviceType> view_type ;
+  typedef Kokkos::View<double,typename view_type::host_mirror_space,Kokkos::MemoryUnmanaged> unmanaged_result ;
+
+  view_type a("A",100,5);
+  typename view_type::HostMirror h_a = Kokkos::create_mirror_view(a);
+
+  for(int i=0;i<100;i++) {
+    for(int j=0;j<5;j++)
+       h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j;
+  }
+  Kokkos::deep_copy(a,h_a);
+
+  double result = 0.0;
+  if(PWRTest==false)
+    Kokkos::parallel_reduce(100,FunctorReduceTest<DeviceType>(a), unmanaged_result( & result ));
+  else
+    Kokkos::parallel_reduce(policy_type(25,4),FunctorReduceTest<DeviceType>(a), unmanaged_result( & result ));
+
+  return result;
+}
+
+#if defined (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
+template<class DeviceType, bool PWRTest>
+double ReduceTestLambda() {
+
+  typedef Kokkos::TeamPolicy<DeviceType> policy_type ;
+  typedef Kokkos::View<double**,DeviceType> view_type ;
+  typedef Kokkos::View<double,typename view_type::host_mirror_space,Kokkos::MemoryUnmanaged> unmanaged_result ;
+
+  view_type a("A",100,5);
+  typename view_type::HostMirror h_a = Kokkos::create_mirror_view(a);
+
+  for(int i=0;i<100;i++) {
+    for(int j=0;j<5;j++)
+       h_a(i,j) = 0.1*i/(1.1*j+1.0) + 0.5*j;
+  }
+  Kokkos::deep_copy(a,h_a);
+
+  double result = 0.0;
+
+  if(PWRTest==false) {
+    Kokkos::parallel_reduce(100,[=](const int& i, double& sum)  {
+      sum += a(i,1) + a(i,2);
+      sum += a(i,0) - a(i,3);
+      sum += a(i,4) + a(i,0);
+      sum += a(i,2) - a(i,1);
+      sum += a(i,3) + a(i,4);
+    }, unmanaged_result( & result ) );
+  } else {
+    typedef typename policy_type::member_type team_member ;
+    Kokkos::parallel_reduce(policy_type(25,4),[=](const team_member & dev, double& sum)  {
+      int i = dev.league_rank()*dev.team_size() + dev.team_rank();
+      sum += a(i,1) + a(i,2);
+      sum += a(i,0) - a(i,3);
+      sum += a(i,4) + a(i,0);
+      sum += a(i,2) - a(i,1);
+      sum += a(i,3) + a(i,4);
+    }, unmanaged_result( & result ) );
+  }
+
+  return result;
+}
+
+#else
+template<class DeviceType, bool PWRTest>
+double ReduceTestLambda() {
+  return ReduceTestFunctor<DeviceType,PWRTest>();
+}
+#endif
+
+template<class DeviceType>
+double TestVariantLambda(int test) {
+  switch (test) {
+    case 1: return AddTestLambda<DeviceType,false>();
+    case 2: return AddTestLambda<DeviceType,true>();
+    case 3: return ReduceTestLambda<DeviceType,false>();
+    case 4: return ReduceTestLambda<DeviceType,true>();
+  }
+  return 0;
+}
+
+
+template<class DeviceType>
+double TestVariantFunctor(int test) {
+  switch (test) {
+    case 1: return AddTestFunctor<DeviceType,false>();
+    case 2: return AddTestFunctor<DeviceType,true>();
+    case 3: return ReduceTestFunctor<DeviceType,false>();
+    case 4: return ReduceTestFunctor<DeviceType,true>();
+  }
+  return 0;
+}
+
+template<class DeviceType>
+bool Test(int test) {
+
+#ifdef KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA
+  double res_functor = TestVariantFunctor<DeviceType>(test);
+  double res_lambda = TestVariantLambda<DeviceType>(test);
+
+  char testnames[5][256] = {" "
+                            ,"AddTest","AddTest TeamPolicy"
+                            ,"ReduceTest","ReduceTest TeamPolicy"
+                           };
+  bool passed = true;
+
+  if ( res_functor != res_lambda ) {
+    passed = false;
+
+    std::cout << "CXX11 ( test = '"
+              << testnames[test] << "' FAILED : "
+              << res_functor << " != " << res_lambda
+              << std::endl ;
+  }
+
+  return passed ;
+#else
+  return true;
+#endif
+}
+
+}
diff --git a/lib/kokkos/core/unit_test/TestCXX11Deduction.hpp b/lib/kokkos/core/unit_test/TestCXX11Deduction.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..9d20079b2fb13730feac99002a9c2590b6b800ff
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestCXX11Deduction.hpp
@@ -0,0 +1,103 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+#include <Kokkos_Core.hpp>
+
+#ifndef TESTCXX11DEDUCTION_HPP
+#define TESTCXX11DEDUCTION_HPP
+
+namespace TestCXX11 {
+
+#if defined( KOKKOS_HAVE_CXX11 )
+
+struct TestReductionDeductionTagA {};
+struct TestReductionDeductionTagB {};
+
+template < class ExecSpace >
+struct TestReductionDeductionFunctor {
+
+  // KOKKOS_INLINE_FUNCTION
+  // void operator()( long i , long & value ) const
+  // { value += i + 1 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( TestReductionDeductionTagA , long i , long & value ) const
+  { value += ( 2 * i + 1 ) + ( 2 * i + 2 ); }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TestReductionDeductionTagB & , const long i , long & value ) const
+  { value += ( 3 * i + 1 ) + ( 3 * i + 2 ) + ( 3 * i + 3 ) ; }
+
+};
+
+template< class ExecSpace >
+void test_reduction_deduction()
+{
+  typedef TestReductionDeductionFunctor< ExecSpace > Functor ;
+
+  const long N = 50 ;
+  // const long answer  = N % 2 ? ( N * ((N+1)/2 )) : ( (N/2) * (N+1) );
+  const long answerA = N % 2 ? ( (2*N) * (((2*N)+1)/2 )) : ( ((2*N)/2) * ((2*N)+1) );
+  const long answerB = N % 2 ? ( (3*N) * (((3*N)+1)/2 )) : ( ((3*N)/2) * ((3*N)+1) );
+  long result = 0 ;
+
+  // Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace>(0,N) , Functor() , result );
+  // ASSERT_EQ( answer , result );
+  
+  Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace,TestReductionDeductionTagA>(0,N) , Functor() , result );
+  ASSERT_EQ( answerA , result );
+  
+  Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace,TestReductionDeductionTagB>(0,N) , Functor() , result );
+  ASSERT_EQ( answerB , result );
+}
+
+#else /* ! defined( KOKKOS_HAVE_CXX11 ) */
+
+template< class ExecSpace >
+void test_reduction_deduction() {}
+
+#endif /* ! defined( KOKKOS_HAVE_CXX11 ) */
+
+}
+
+#endif
+
diff --git a/lib/kokkos/core/unit_test/TestCompilerMacros.hpp b/lib/kokkos/core/unit_test/TestCompilerMacros.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..dfa2250c04ae8cc785383b1f64a127ad40279f57
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestCompilerMacros.hpp
@@ -0,0 +1,93 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#define KOKKOS_PRAGMA_UNROLL(a)
+
+namespace TestCompilerMacros {
+
+template<class DEVICE_TYPE>
+struct AddFunctor {
+  typedef DEVICE_TYPE execution_space;
+  typedef typename Kokkos::View<int**,execution_space> type;
+  type a,b;
+  int length;
+
+  AddFunctor(type a_, type b_):a(a_),b(b_),length(a.dimension_1()) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()(int i) const {
+#ifdef KOKKOS_HAVE_PRAGMA_UNROLL
+    #pragma unroll
+#endif
+#ifdef KOKKOS_HAVE_PRAGMA_IVDEP
+    #pragma ivdep
+#endif
+#ifdef KOKKOS_HAVE_PRAGMA_VECTOR
+    #pragma vector always
+#endif
+#ifdef KOKKOS_HAVE_PRAGMA_LOOPCOUNT
+    #pragma loop count(128)
+#endif
+#ifdef KOKKOS_HAVE_PRAGMA_SIMD
+    #pragma simd
+#endif
+    for(int j=0;j<length;j++)
+      a(i,j) += b(i,j);
+  }
+};
+
+template<class DeviceType>
+bool Test() {
+  typedef typename Kokkos::View<int**,DeviceType> type;
+  type a("A",1024,128);
+  type b("B",1024,128);
+
+  AddFunctor<DeviceType> f(a,b);
+  Kokkos::parallel_for(1024,f);
+  DeviceType::fence();
+  return true;
+}
+
+}
diff --git a/lib/kokkos/core/unit_test/TestCuda.cpp b/lib/kokkos/core/unit_test/TestCuda.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..4a74d1f1836f3cd3160e683ccbeae41fb45f563a
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestCuda.cpp
@@ -0,0 +1,495 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <impl/Kokkos_ViewTileLeft.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+
+#include <TestViewImpl.hpp>
+#include <TestAtomic.hpp>
+
+#include <TestViewAPI.hpp>
+#include <TestViewSubview.hpp>
+#include <TestTile.hpp>
+
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestAggregate.hpp>
+#include <TestAggregateReduction.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestMemorySpaceTracking.hpp>
+#include <TestTeamVector.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+#include <TestCXX11Deduction.hpp>
+
+//----------------------------------------------------------------------------
+
+class cuda : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    Kokkos::Cuda::print_configuration( std::cout );
+    Kokkos::HostSpace::execution_space::initialize();
+    Kokkos::Cuda::initialize( Kokkos::Cuda::SelectDevice(0) );
+  }
+  static void TearDownTestCase()
+  {
+    Kokkos::Cuda::finalize();
+    Kokkos::HostSpace::execution_space::finalize();
+  }
+};
+
+//----------------------------------------------------------------------------
+
+namespace Test {
+
+__global__
+void test_abort()
+{
+  Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<
+    Kokkos::CudaSpace ,
+    Kokkos::HostSpace >::verify();
+}
+
+__global__
+void test_cuda_spaces_int_value( int * ptr )
+{
+  if ( *ptr == 42 ) { *ptr = 2 * 42 ; }
+}
+
+
+TEST_F( cuda , compiler_macros )
+{
+  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Cuda >() ) );
+}
+
+TEST_F( cuda , memory_space )
+{
+  TestMemorySpace< Kokkos::Cuda >();
+}
+
+TEST_F( cuda, spaces )
+{
+  if ( Kokkos::CudaUVMSpace::available() ) {
+
+    Kokkos::Impl::AllocationTracker tracker = Kokkos::CudaUVMSpace::allocate_and_track("uvm_ptr",sizeof(int));
+
+    int * uvm_ptr = (int*) tracker.alloc_ptr();
+
+    *uvm_ptr = 42 ;
+
+    Kokkos::Cuda::fence();
+    test_cuda_spaces_int_value<<<1,1>>>(uvm_ptr);
+    Kokkos::Cuda::fence();
+
+    EXPECT_EQ( *uvm_ptr, int(2*42) );
+
+  }
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( cuda , impl_shared_alloc )
+{
+  test_shared_alloc< Kokkos::CudaSpace , Kokkos::HostSpace::execution_space >();
+  test_shared_alloc< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >();
+  test_shared_alloc< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >();
+}
+
+TEST_F( cuda , impl_view_mapping )
+{
+  test_view_mapping< Kokkos::Cuda >();
+  test_view_mapping_subview< Kokkos::Cuda >();
+  test_view_mapping_operator< Kokkos::Cuda >();
+  TestViewMappingAtomic< Kokkos::Cuda >::run();
+}
+
+template< class MemSpace >
+struct TestViewCudaTexture {
+
+  enum { N = 1000 };
+
+  using V = Kokkos::Experimental::View<double*,MemSpace> ;
+  using T = Kokkos::Experimental::View<const double*, MemSpace, Kokkos::MemoryRandomAccess > ;
+
+  V m_base ;
+  T m_tex ;
+
+  struct TagInit {};
+  struct TagTest {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagTest & , const int i , long & error_count ) const
+    { if ( m_tex[i] != i + 1 ) ++error_count ; }
+
+  TestViewCudaTexture()
+    : m_base("base",N)
+    , m_tex( m_base )
+    {}
+
+  static void run()
+    {
+      EXPECT_TRUE( ( std::is_same< typename V::reference_type
+                                 , double &
+                                 >::value ) );
+
+      EXPECT_TRUE( ( std::is_same< typename T::reference_type
+                                 , const double
+                                 >::value ) );
+
+      EXPECT_TRUE(  V::reference_type_is_lvalue_reference ); // An ordinary view
+      EXPECT_FALSE( T::reference_type_is_lvalue_reference ); // Texture fetch returns by value
+
+      TestViewCudaTexture self ;
+      Kokkos::parallel_for( Kokkos::RangePolicy< Kokkos::Cuda , TagInit >(0,N) , self );
+      long error_count = -1 ;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Cuda , TagTest >(0,N) , self , error_count );
+      EXPECT_EQ( error_count , 0 );
+    }
+};
+
+
+TEST_F( cuda , impl_view_texture )
+{
+  TestViewCudaTexture< Kokkos::CudaSpace >::run();
+  TestViewCudaTexture< Kokkos::CudaUVMSpace >::run();
+}
+
+template< class MemSpace , class ExecSpace >
+struct TestViewCudaAccessible {
+
+  enum { N = 1000 };
+
+  using V = Kokkos::Experimental::View<double*,MemSpace> ;
+
+  V m_base ;
+
+  struct TagInit {};
+  struct TagTest {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagInit & , const int i ) const { m_base[i] = i + 1 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagTest & , const int i , long & error_count ) const
+    { if ( m_base[i] != i + 1 ) ++error_count ; }
+
+  TestViewCudaAccessible()
+    : m_base("base",N)
+    {}
+
+  static void run()
+    {
+      TestViewCudaAccessible self ;
+      Kokkos::parallel_for( Kokkos::RangePolicy< typename MemSpace::execution_space , TagInit >(0,N) , self );
+      MemSpace::execution_space::fence();
+      // Next access is a different execution space, must complete prior kernel.
+      long error_count = -1 ;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , TagTest >(0,N) , self , error_count );
+      EXPECT_EQ( error_count , 0 );
+    }
+};
+
+
+TEST_F( cuda , impl_view_accessible )
+{
+  TestViewCudaAccessible< Kokkos::CudaSpace , Kokkos::Cuda >::run();
+
+  TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::Cuda >::run();
+  TestViewCudaAccessible< Kokkos::CudaUVMSpace , Kokkos::HostSpace::execution_space >::run();
+
+  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::Cuda >::run();
+  TestViewCudaAccessible< Kokkos::CudaHostPinnedSpace , Kokkos::HostSpace::execution_space >::run();
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( cuda, view_impl )
+{
+  // test_abort<<<32,32>>>(); // Aborts the kernel with CUDA version 4.1 or greater
+
+  test_view_impl< Kokkos::Cuda >();
+}
+
+TEST_F( cuda, view_api )
+{
+  typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess > > view_texture_managed ;
+  typedef Kokkos::View< const int * , Kokkos::Cuda , Kokkos::MemoryTraits< Kokkos::RandomAccess | Kokkos::Unmanaged > > view_texture_unmanaged ;
+
+  TestViewAPI< double , Kokkos::Cuda >();
+
+#if 0
+  Kokkos::View<double, Kokkos::Cuda > x("x");
+  Kokkos::View<double[1], Kokkos::Cuda > y("y");
+  // *x = 10 ;
+  // x() = 10 ;
+  // y[0] = 10 ;
+  // y(0) = 10 ;
+#endif
+}
+
+TEST_F( cuda, view_subview_auto_1d_left ) {
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Cuda >();
+}
+
+TEST_F( cuda, view_subview_auto_1d_right ) {
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Cuda >();
+}
+
+TEST_F( cuda, view_subview_auto_1d_stride ) {
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Cuda >();
+}
+
+TEST_F( cuda, view_subview_assign_strided ) {
+  TestViewSubview::test_1d_strided_assignment< Kokkos::Cuda >();
+}
+
+TEST_F( cuda, view_subview_left_0 ) {
+  TestViewSubview::test_left_0< Kokkos::CudaUVMSpace >();
+}
+
+TEST_F( cuda, view_subview_left_1 ) {
+  TestViewSubview::test_left_1< Kokkos::CudaUVMSpace >();
+}
+
+TEST_F( cuda, view_subview_left_2 ) {
+  TestViewSubview::test_left_2< Kokkos::CudaUVMSpace >();
+}
+
+TEST_F( cuda, view_subview_left_3 ) {
+  TestViewSubview::test_left_3< Kokkos::CudaUVMSpace >();
+}
+
+TEST_F( cuda, view_subview_right_0 ) {
+  TestViewSubview::test_right_0< Kokkos::CudaUVMSpace >();
+}
+
+TEST_F( cuda, view_subview_right_1 ) {
+  TestViewSubview::test_right_1< Kokkos::CudaUVMSpace >();
+}
+
+TEST_F( cuda, view_subview_right_3 ) {
+  TestViewSubview::test_right_3< Kokkos::CudaUVMSpace >();
+}
+
+
+
+
+TEST_F( cuda, range_tag )
+{
+  TestRange< Kokkos::Cuda >::test_for(1000);
+  TestRange< Kokkos::Cuda >::test_reduce(1000);
+  TestRange< Kokkos::Cuda >::test_scan(1000);
+}
+
+TEST_F( cuda, team_tag )
+{
+  TestTeamPolicy< Kokkos::Cuda >::test_for(1000);
+  TestTeamPolicy< Kokkos::Cuda >::test_reduce(1000);
+}
+
+TEST_F( cuda, reduce )
+{
+  TestReduce< long ,   Kokkos::Cuda >( 10000000 );
+  TestReduce< double , Kokkos::Cuda >( 1000000 );
+}
+
+TEST_F( cuda, reduce_team )
+{
+  TestReduceTeam< long ,   Kokkos::Cuda >( 10000000 );
+  TestReduceTeam< double , Kokkos::Cuda >( 1000000 );
+}
+
+TEST_F( cuda, shared_team )
+{
+  TestSharedTeam< Kokkos::Cuda >();
+}
+
+TEST_F( cuda, reduce_dynamic )
+{
+  TestReduceDynamic< long ,   Kokkos::Cuda >( 10000000 );
+  TestReduceDynamic< double , Kokkos::Cuda >( 1000000 );
+}
+
+TEST_F( cuda, reduce_dynamic_view )
+{
+  TestReduceDynamicView< long ,   Kokkos::Cuda >( 10000000 );
+  TestReduceDynamicView< double , Kokkos::Cuda >( 1000000 );
+}
+
+TEST_F( cuda, atomic )
+{
+  const int loop_count = 1e3 ;
+
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Cuda>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Cuda>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Cuda>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Cuda>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Cuda>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Cuda>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Cuda>(100,3) ) );
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( cuda, tile_layout)
+{
+  TestTile::test< Kokkos::Cuda , 1 , 1 >( 1 , 1 );
+  TestTile::test< Kokkos::Cuda , 1 , 1 >( 2 , 3 );
+  TestTile::test< Kokkos::Cuda , 1 , 1 >( 9 , 10 );
+
+  TestTile::test< Kokkos::Cuda , 2 , 2 >( 1 , 1 );
+  TestTile::test< Kokkos::Cuda , 2 , 2 >( 2 , 3 );
+  TestTile::test< Kokkos::Cuda , 2 , 2 >( 4 , 4 );
+  TestTile::test< Kokkos::Cuda , 2 , 2 >( 9 , 9 );
+
+  TestTile::test< Kokkos::Cuda , 2 , 4 >( 9 , 9 );
+  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 );
+
+  TestTile::test< Kokkos::Cuda , 4 , 4 >( 1 , 1 );
+  TestTile::test< Kokkos::Cuda , 4 , 4 >( 4 , 4 );
+  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 9 );
+  TestTile::test< Kokkos::Cuda , 4 , 4 >( 9 , 11 );
+
+  TestTile::test< Kokkos::Cuda , 8 , 8 >( 1 , 1 );
+  TestTile::test< Kokkos::Cuda , 8 , 8 >( 4 , 4 );
+  TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 9 );
+  TestTile::test< Kokkos::Cuda , 8 , 8 >( 9 , 11 );
+}
+
+
+TEST_F( cuda , view_aggregate )
+{
+  TestViewAggregate< Kokkos::Cuda >();
+  TestViewAggregateReduction< Kokkos::Cuda >();
+}
+
+
+TEST_F( cuda , scan )
+{
+  TestScan< Kokkos::Cuda >::test_range( 1 , 1000 );
+  TestScan< Kokkos::Cuda >( 1000000 );
+  TestScan< Kokkos::Cuda >( 10000000 );
+  Kokkos::Cuda::fence();
+}
+
+TEST_F( cuda , team_scan )
+{
+  TestScanTeam< Kokkos::Cuda >( 10 );
+  TestScanTeam< Kokkos::Cuda >( 10000 );
+}
+
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( cuda , template_meta_functions )
+{
+  TestTemplateMetaFunctions<int, Kokkos::Cuda >();
+}
+
+//----------------------------------------------------------------------------
+
+#ifdef KOKKOS_HAVE_CXX11
+
+namespace Test {
+
+TEST_F( cuda , reduction_deduction )
+{
+  TestCXX11::test_reduction_deduction< Kokkos::Cuda >();
+}
+
+TEST_F( cuda , team_vector )
+{
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(0) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(1) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(2) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(3) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(4) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(5) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(6) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(7) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(8) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(9) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Cuda >(10) ) );
+}
+
+}
+#endif
+
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceType.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceType.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..d1a525f9e5952034295efba204d74e39b0461129
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceType.cpp
@@ -0,0 +1,250 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if !defined(KOKKOS_HAVE_CUDA) || defined(__CUDACC__)
+//----------------------------------------------------------------------------
+
+#include <TestViewImpl.hpp>
+#include <TestAtomic.hpp>
+
+#include <TestViewAPI.hpp>
+
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestTeam.hpp>
+#include <TestAggregate.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestCXX11.hpp>
+#include <TestTeamVector.hpp>
+
+namespace Test {
+
+class defaultdevicetype : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    Kokkos::initialize();
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::finalize();
+  }
+};
+
+
+TEST_F( defaultdevicetype, view_impl) {
+  test_view_impl< Kokkos::DefaultExecutionSpace >();
+}
+
+TEST_F( defaultdevicetype, view_api) {
+  TestViewAPI< double , Kokkos::DefaultExecutionSpace >();
+}
+
+TEST_F( defaultdevicetype, long_reduce) {
+  TestReduce< long ,   Kokkos::DefaultExecutionSpace >( 100000 );
+}
+
+TEST_F( defaultdevicetype, double_reduce) {
+  TestReduce< double ,   Kokkos::DefaultExecutionSpace >( 100000 );
+}
+
+TEST_F( defaultdevicetype, long_reduce_dynamic ) {
+  TestReduceDynamic< long ,   Kokkos::DefaultExecutionSpace >( 100000 );
+}
+
+TEST_F( defaultdevicetype, double_reduce_dynamic ) {
+  TestReduceDynamic< double ,   Kokkos::DefaultExecutionSpace >( 100000 );
+}
+
+TEST_F( defaultdevicetype, long_reduce_dynamic_view ) {
+  TestReduceDynamicView< long ,   Kokkos::DefaultExecutionSpace >( 100000 );
+}
+
+
+TEST_F( defaultdevicetype , atomics )
+{
+  const int loop_count = 1e4 ;
+
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::DefaultExecutionSpace>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::DefaultExecutionSpace>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::DefaultExecutionSpace>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::DefaultExecutionSpace>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::DefaultExecutionSpace>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::DefaultExecutionSpace>(100,3) ) );
+}
+
+/*TEST_F( defaultdevicetype , view_remap )
+{
+  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
+
+  typedef Kokkos::View< double*[N1][N2][N3] ,
+                             Kokkos::LayoutRight ,
+                             Kokkos::DefaultExecutionSpace > output_type ;
+
+  typedef Kokkos::View< int**[N2][N3] ,
+                             Kokkos::LayoutLeft ,
+                             Kokkos::DefaultExecutionSpace > input_type ;
+
+  typedef Kokkos::View< int*[N0][N2][N3] ,
+                             Kokkos::LayoutLeft ,
+                             Kokkos::DefaultExecutionSpace > diff_type ;
+
+  output_type output( "output" , N0 );
+  input_type  input ( "input" , N0 , N1 );
+  diff_type   diff  ( "diff" , N0 );
+
+  int value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+    input(i0,i1,i2,i3) = ++value ;
+  }}}}
+
+  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
+  Kokkos::deep_copy( output , input );
+
+  value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+    ++value ;
+    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
+  }}}}
+}*/
+
+//----------------------------------------------------------------------------
+
+
+TEST_F( defaultdevicetype , view_aggregate )
+{
+  TestViewAggregate< Kokkos::DefaultExecutionSpace >();
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( defaultdevicetype , scan )
+{
+  TestScan< Kokkos::DefaultExecutionSpace >::test_range( 1 , 1000 );
+  TestScan< Kokkos::DefaultExecutionSpace >( 1000000 );
+  TestScan< Kokkos::DefaultExecutionSpace >( 10000000 );
+  Kokkos::DefaultExecutionSpace::fence();
+}
+
+
+TEST_F( defaultdevicetype , team_scan )
+{
+  TestScanTeam< Kokkos::DefaultExecutionSpace >( 10 );
+  TestScanTeam< Kokkos::DefaultExecutionSpace >( 10000 );
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( defaultdevicetype , compiler_macros )
+{
+  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::DefaultExecutionSpace >() ) );
+}
+
+
+//----------------------------------------------------------------------------
+#if defined (KOKKOS_HAVE_CXX11)
+TEST_F( defaultdevicetype , cxx11 )
+{
+  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(1) ) );
+  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(2) ) );
+  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(3) ) );
+  ASSERT_TRUE( ( TestCXX11::Test< Kokkos::DefaultExecutionSpace >(4) ) );
+}
+#endif
+
+#if defined (KOKKOS_HAVE_CXX11)
+TEST_F( defaultdevicetype , team_vector )
+{
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(0) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(1) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(2) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(3) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(4) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::DefaultExecutionSpace >(5) ) );
+}
+#endif
+
+#if defined (KOKKOS_HAVE_CXX11)
+TEST_F( defaultdevicetype , malloc )
+{
+  int* data = (int*) Kokkos::kokkos_malloc(100*sizeof(int));
+  ASSERT_NO_THROW(data = (int*) Kokkos::kokkos_realloc(data,120*sizeof(int)));
+  Kokkos::kokkos_free(data);
+}
+#endif
+
+} // namespace test
+
+#endif
diff --git a/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.cpp b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..a1e3f8fb0adece50ce4f8f5e8b2204b66bb0fdc6
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestDefaultDeviceTypeInit.cpp
@@ -0,0 +1,390 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#ifdef KOKKOS_HAVE_OPENMP
+#include <omp.h>
+#endif
+
+#if !defined(KOKKOS_HAVE_CUDA) || defined(__CUDACC__)
+//----------------------------------------------------------------------------
+
+namespace Test {
+
+namespace Impl {
+
+  char** init_kokkos_args(bool do_threads,bool do_numa,bool do_device,bool do_other, int& nargs, Kokkos::InitArguments& init_args) {
+    nargs = (do_threads?1:0) +
+            (do_numa?1:0) +
+            (do_device?1:0) +
+            (do_other?4:0);
+    char** args_kokkos = new char*[nargs];
+    for(int i = 0; i < nargs; i++)
+      args_kokkos[i] = new char[20];
+
+    int threads_idx = do_other?1:0;
+    int numa_idx = (do_other?3:0) + (do_threads?1:0);
+    int device_idx = (do_other?3:0) + (do_threads?1:0) + (do_numa?1:0);
+
+
+    if(do_threads) {
+      int nthreads = 3;
+
+#ifdef KOKKOS_HAVE_OPENMP
+      if(omp_get_max_threads() < 3)
+        nthreads = omp_get_max_threads();
+#endif
+
+      if(Kokkos::hwloc::available())  {
+        if(Kokkos::hwloc::get_available_threads_per_core()<3)
+            nthreads =   Kokkos::hwloc::get_available_threads_per_core()
+                       * Kokkos::hwloc::get_available_numa_count();
+      }
+
+#ifdef KOKKOS_HAVE_SERIAL
+      if(Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value ||
+         Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) {
+        nthreads = 1;
+      }
+#endif
+      init_args.num_threads = nthreads;
+      sprintf(args_kokkos[threads_idx],"--threads=%i",nthreads);
+    }
+
+    if(do_numa) {
+      int numa = 1;
+      if(Kokkos::hwloc::available())
+        numa = Kokkos::hwloc::get_available_numa_count();
+#ifdef KOKKOS_HAVE_SERIAL
+      if(Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value ||
+         Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) {
+        numa = 1;
+      }
+#endif
+
+      init_args.num_numa = numa;
+      sprintf(args_kokkos[numa_idx],"--numa=%i",numa);
+    }
+
+    if(do_device) {
+
+      init_args.device_id = 0;
+      sprintf(args_kokkos[device_idx],"--device=%i",0);
+    }
+
+    if(do_other) {
+      sprintf(args_kokkos[0],"--dummyarg=1");
+      sprintf(args_kokkos[threads_idx+(do_threads?1:0)],"--dummy2arg");
+      sprintf(args_kokkos[threads_idx+(do_threads?1:0)+1],"dummy3arg");
+      sprintf(args_kokkos[device_idx+(do_device?1:0)],"dummy4arg=1");
+    }
+
+
+    return args_kokkos;
+  }
+
+  Kokkos::InitArguments init_initstruct(bool do_threads, bool do_numa, bool do_device) {
+    Kokkos::InitArguments args;
+
+    if(do_threads) {
+      int nthreads = 3;
+
+#ifdef KOKKOS_HAVE_OPENMP
+      if(omp_get_max_threads() < 3)
+        nthreads = omp_get_max_threads();
+#endif
+
+      if(Kokkos::hwloc::available())  {
+        if(Kokkos::hwloc::get_available_threads_per_core()<3)
+            nthreads =   Kokkos::hwloc::get_available_threads_per_core()
+                       * Kokkos::hwloc::get_available_numa_count();
+      }
+#ifdef KOKKOS_HAVE_SERIAL
+      if(Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value ||
+         Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) {
+        nthreads = 1;
+      }
+#endif
+
+      args.num_threads = nthreads;
+    }
+
+    if(do_numa) {
+      int numa = 1;
+      if(Kokkos::hwloc::available())
+        numa = Kokkos::hwloc::get_available_numa_count();
+#ifdef KOKKOS_HAVE_SERIAL
+      if(Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultExecutionSpace>::value ||
+         Kokkos::Impl::is_same<Kokkos::Serial,Kokkos::DefaultHostExecutionSpace>::value ) {
+        numa = 1;
+      }
+#endif
+      args.num_numa = numa;
+    }
+
+    if(do_device) {
+      args.device_id = 0;
+    }
+
+    return args;
+  }
+
+  void check_correct_initialization(const Kokkos::InitArguments& argstruct) {
+    ASSERT_EQ( Kokkos::DefaultExecutionSpace::is_initialized(), 1);
+    ASSERT_EQ( Kokkos::HostSpace::execution_space::is_initialized(), 1);
+
+    //Figure out the number of threads the HostSpace ExecutionSpace should have initialized to
+    int expected_nthreads = argstruct.num_threads;
+    if(expected_nthreads<1) {
+      if(Kokkos::hwloc::available()) {
+        expected_nthreads = Kokkos::hwloc::get_available_numa_count()
+                          * Kokkos::hwloc::get_available_cores_per_numa()
+                          * Kokkos::hwloc::get_available_threads_per_core();
+      } else {
+        #ifdef KOKKOS_HAVE_OPENMP
+        if(Kokkos::Impl::is_same<Kokkos::HostSpace::execution_space,Kokkos::OpenMP>::value) {
+          expected_nthreads = omp_get_max_threads();
+        } else
+        #endif
+          expected_nthreads = 1;
+
+      }
+      #ifdef KOKKOS_HAVE_SERIAL
+      if(Kokkos::Impl::is_same<Kokkos::DefaultExecutionSpace,Kokkos::Serial>::value ||
+         Kokkos::Impl::is_same<Kokkos::DefaultHostExecutionSpace,Kokkos::Serial>::value ) 
+        expected_nthreads = 1;
+      #endif
+    }
+
+    int expected_numa = argstruct.num_numa;
+    if(expected_numa<1) {
+      if(Kokkos::hwloc::available()) {
+        expected_numa = Kokkos::hwloc::get_available_numa_count();
+      } else {
+        expected_numa = 1;
+      }
+      #ifdef KOKKOS_HAVE_SERIAL
+      if(Kokkos::Impl::is_same<Kokkos::DefaultExecutionSpace,Kokkos::Serial>::value ||
+         Kokkos::Impl::is_same<Kokkos::DefaultHostExecutionSpace,Kokkos::Serial>::value )
+        expected_numa = 1;
+      #endif
+    }
+    ASSERT_EQ(Kokkos::HostSpace::execution_space::thread_pool_size(),expected_nthreads);
+
+#ifdef KOKKOS_HAVE_CUDA
+    if(Kokkos::Impl::is_same<Kokkos::DefaultExecutionSpace,Kokkos::Cuda>::value) {
+      int device;
+      cudaGetDevice( &device );
+      int expected_device = argstruct.device_id;
+      if(argstruct.device_id<0) {
+        expected_device = 0;
+      }
+      ASSERT_EQ(expected_device,device);
+    }
+#endif
+  }
+
+  //ToDo: Add check whether correct number of threads are actually started
+  void test_no_arguments() {
+    Kokkos::initialize();
+    check_correct_initialization(Kokkos::InitArguments());
+    Kokkos::finalize();
+  }
+
+  void test_commandline_args(int nargs, char** args, const Kokkos::InitArguments& argstruct) {
+    Kokkos::initialize(nargs,args);
+    check_correct_initialization(argstruct);
+    Kokkos::finalize();
+  }
+
+  void test_initstruct_args(const Kokkos::InitArguments& args) {
+    Kokkos::initialize(args);
+    check_correct_initialization(args);
+    Kokkos::finalize();
+  }
+}
+
+class defaultdevicetypeinit : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+  }
+
+  static void TearDownTestCase()
+  {
+  }
+};
+
+
+TEST_F( defaultdevicetypeinit, no_args) {
+  Impl::test_no_arguments();
+}
+
+TEST_F( defaultdevicetypeinit, commandline_args_empty) {
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args(false,false,false,false,nargs, argstruct);
+  Impl::test_commandline_args(nargs,args,argstruct);
+  for(int i = 0; i < nargs; i++)
+    delete [] args[i];
+  delete [] args;
+}
+
+TEST_F( defaultdevicetypeinit, commandline_args_other) {
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args(false,false,false,true,nargs, argstruct);
+  Impl::test_commandline_args(nargs,args,argstruct);
+  for(int i = 0; i < nargs; i++)
+    delete [] args[i];
+  delete [] args;
+}
+
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads) {
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args(true,false,false,false,nargs, argstruct);
+  Impl::test_commandline_args(nargs,args,argstruct);
+  for(int i = 0; i < nargs; i++)
+    delete [] args[i];
+  delete [] args;
+}
+
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa) {
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args(true,true,false,false,nargs, argstruct);
+  Impl::test_commandline_args(nargs,args,argstruct);
+  for(int i = 0; i < nargs; i++)
+    delete [] args[i];
+  delete [] args;
+}
+
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device) {
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args(true,true,true,false,nargs, argstruct);
+  Impl::test_commandline_args(nargs,args,argstruct);
+  for(int i = 0; i < nargs; i++)
+    delete [] args[i];
+  delete [] args;
+}
+
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_device) {
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args(true,false,true,false,nargs, argstruct);
+  Impl::test_commandline_args(nargs,args,argstruct);
+  for(int i = 0; i < nargs; i++)
+    delete [] args[i];
+  delete [] args;
+}
+
+TEST_F( defaultdevicetypeinit, commandline_args_numa_device) {
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args(false,true,true,false,nargs, argstruct);
+  Impl::test_commandline_args(nargs,args,argstruct);
+  for(int i = 0; i < nargs; i++)
+    delete [] args[i];
+  delete [] args;
+}
+
+TEST_F( defaultdevicetypeinit, commandline_args_device) {
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args(false,false,true,false,nargs, argstruct);
+  Impl::test_commandline_args(nargs,args,argstruct);
+  for(int i = 0; i < nargs; i++)
+    delete [] args[i];
+  delete [] args;
+}
+
+TEST_F( defaultdevicetypeinit, commandline_args_nthreads_numa_device_other) {
+  Kokkos::InitArguments argstruct;
+  int nargs = 0;
+  char** args = Impl::init_kokkos_args(true,true,true,true,nargs, argstruct);
+  Impl::test_commandline_args(nargs,args,argstruct);
+  for(int i = 0; i < nargs; i++)
+    delete [] args[i];
+  delete [] args;
+}
+
+TEST_F( defaultdevicetypeinit, initstruct_default) {
+  Kokkos::InitArguments args;
+  Impl::test_initstruct_args(args);
+}
+
+TEST_F( defaultdevicetypeinit, initstruct_nthreads) {
+  Kokkos::InitArguments args = Impl::init_initstruct(true,false,false);
+  Impl::test_initstruct_args(args);
+}
+
+TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa) {
+  Kokkos::InitArguments args = Impl::init_initstruct(true,true,false);
+  Impl::test_initstruct_args(args);
+}
+
+TEST_F( defaultdevicetypeinit, initstruct_device) {
+  Kokkos::InitArguments args = Impl::init_initstruct(false,false,true);
+  Impl::test_initstruct_args(args);
+}
+
+TEST_F( defaultdevicetypeinit, initstruct_nthreads_device) {
+  Kokkos::InitArguments args = Impl::init_initstruct(true,false,true);
+  Impl::test_initstruct_args(args);
+}
+
+
+TEST_F( defaultdevicetypeinit, initstruct_nthreads_numa_device) {
+  Kokkos::InitArguments args = Impl::init_initstruct(true,true,true);
+  Impl::test_initstruct_args(args);
+}
+
+
+
+} // namespace test
+
+#endif
diff --git a/lib/kokkos/core/unit_test/TestHWLOC.cpp b/lib/kokkos/core/unit_test/TestHWLOC.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..1637dec5de4ff762cfbd259ee47932b5e85eb4d0
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestHWLOC.cpp
@@ -0,0 +1,69 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+#include <Kokkos_hwloc.hpp>
+
+namespace Test {
+
+class hwloc : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {}
+
+  static void TearDownTestCase()
+  {}
+};
+
+TEST_F( hwloc, query)
+{
+  std::cout << " NUMA[" << Kokkos::hwloc::get_available_numa_count() << "]"
+            << " CORE[" << Kokkos::hwloc::get_available_cores_per_numa() << "]"
+            << " PU[" << Kokkos::hwloc::get_available_threads_per_core()  << "]"
+            << std::endl ;
+}
+
+}
+
diff --git a/lib/kokkos/core/unit_test/TestMemorySpaceTracking.hpp b/lib/kokkos/core/unit_test/TestMemorySpaceTracking.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..80ffcc2afd81c102638f20a62365b3b2a071fc6e
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestMemorySpaceTracking.hpp
@@ -0,0 +1,100 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <iostream>
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace {
+
+template<class Arg1>
+class TestMemorySpace {
+public:
+
+  typedef typename Arg1::memory_space MemorySpace;
+  TestMemorySpace() { run_test(); }
+
+  void run_test()
+  {
+
+#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
+
+    Kokkos::View<int* ,Arg1> invalid;
+    ASSERT_EQ(0u, invalid.tracker().ref_count() );
+
+    {
+      Kokkos::View<int* ,Arg1> a("A",10);
+
+      ASSERT_EQ(1u, a.tracker().ref_count() );
+
+      {
+        Kokkos::View<int* ,Arg1> b = a;
+        ASSERT_EQ(2u, b.tracker().ref_count() );
+
+        Kokkos::View<int* ,Arg1> D("D",10);
+        ASSERT_EQ(1u, D.tracker().ref_count() );
+
+        {
+          Kokkos::View<int* ,Arg1> E("E",10);
+          ASSERT_EQ(1u, E.tracker().ref_count() );
+        }
+
+        ASSERT_EQ(2u, b.tracker().ref_count() );
+      }
+      ASSERT_EQ(1u, a.tracker().ref_count() );
+    }
+
+#endif
+
+  }
+};
+
+}
+
+/*--------------------------------------------------------------------------*/
+
+
+
diff --git a/lib/kokkos/core/unit_test/TestOpenMP.cpp b/lib/kokkos/core/unit_test/TestOpenMP.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..8d4bcd1e2625330594fbe12997d5ea6fb5b98c20
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestOpenMP.cpp
@@ -0,0 +1,375 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <TestViewImpl.hpp>
+#include <TestAtomic.hpp>
+
+#include <TestViewAPI.hpp>
+#include <TestViewSubview.hpp>
+
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestAggregate.hpp>
+#include <TestAggregateReduction.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestCXX11.hpp>
+#include <TestCXX11Deduction.hpp>
+#include <TestTeamVector.hpp>
+#include <TestMemorySpaceTracking.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+
+namespace Test {
+
+class openmp : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
+    const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+    const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+    const unsigned threads_count = std::max( 1u , numa_count ) *
+                                   std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 );
+
+    Kokkos::OpenMP::initialize( threads_count );
+    Kokkos::OpenMP::print_configuration( std::cout , true );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::OpenMP::finalize();
+
+    omp_set_num_threads(1);
+
+    ASSERT_EQ( 1 , omp_get_max_threads() );
+  }
+};
+
+
+TEST_F( openmp , impl_shared_alloc ) {
+  test_shared_alloc< Kokkos::HostSpace , Kokkos::OpenMP >();
+}
+
+TEST_F( openmp , impl_view_mapping ) {
+  test_view_mapping< Kokkos::OpenMP >();
+  test_view_mapping_subview< Kokkos::OpenMP >();
+  test_view_mapping_operator< Kokkos::OpenMP >();
+  TestViewMappingAtomic< Kokkos::OpenMP >::run();
+}
+
+TEST_F( openmp, view_impl) {
+  test_view_impl< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_api) {
+  TestViewAPI< double , Kokkos::OpenMP >();
+}
+
+
+TEST_F( openmp, view_subview_auto_1d_left ) {
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_auto_1d_right ) {
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_auto_1d_stride ) {
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_assign_strided ) {
+  TestViewSubview::test_1d_strided_assignment< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_left_0 ) {
+  TestViewSubview::test_left_0< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_left_1 ) {
+  TestViewSubview::test_left_1< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_left_2 ) {
+  TestViewSubview::test_left_2< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_left_3 ) {
+  TestViewSubview::test_left_3< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_right_0 ) {
+  TestViewSubview::test_right_0< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_right_1 ) {
+  TestViewSubview::test_right_1< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp, view_subview_right_3 ) {
+  TestViewSubview::test_right_3< Kokkos::OpenMP >();
+}
+
+
+
+TEST_F( openmp , range_tag )
+{
+  TestRange< Kokkos::OpenMP >::test_for(1000);
+  TestRange< Kokkos::OpenMP >::test_reduce(1000);
+  TestRange< Kokkos::OpenMP >::test_scan(1000);
+}
+
+TEST_F( openmp , team_tag )
+{
+  TestTeamPolicy< Kokkos::OpenMP >::test_for(1000);
+  TestTeamPolicy< Kokkos::OpenMP >::test_reduce(1000);
+}
+
+TEST_F( openmp, long_reduce) {
+  TestReduce< long ,   Kokkos::OpenMP >( 1000000 );
+}
+
+TEST_F( openmp, double_reduce) {
+  TestReduce< double ,   Kokkos::OpenMP >( 1000000 );
+}
+
+TEST_F( openmp, long_reduce_dynamic ) {
+  TestReduceDynamic< long ,   Kokkos::OpenMP >( 1000000 );
+}
+
+TEST_F( openmp, double_reduce_dynamic ) {
+  TestReduceDynamic< double ,   Kokkos::OpenMP >( 1000000 );
+}
+
+TEST_F( openmp, long_reduce_dynamic_view ) {
+  TestReduceDynamicView< long ,   Kokkos::OpenMP >( 1000000 );
+}
+
+TEST_F( openmp, team_long_reduce) {
+  TestReduceTeam< long ,   Kokkos::OpenMP >( 100000 );
+}
+
+TEST_F( openmp, team_double_reduce) {
+  TestReduceTeam< double ,   Kokkos::OpenMP >( 100000 );
+}
+
+TEST_F( openmp, team_shared_request) {
+  TestSharedTeam< Kokkos::OpenMP >();
+}
+
+
+TEST_F( openmp , atomics )
+{
+  const int loop_count = 1e4 ;
+
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::OpenMP>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::OpenMP>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::OpenMP>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::OpenMP>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::OpenMP>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::OpenMP>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::OpenMP>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::OpenMP>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::OpenMP>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::OpenMP>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::OpenMP>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::OpenMP>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::OpenMP>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::OpenMP>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::OpenMP>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::OpenMP>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::OpenMP>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::OpenMP>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::OpenMP>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::OpenMP>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::OpenMP>(100,3) ) );
+
+#if defined( KOKKOS_ENABLE_ASM )
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::OpenMP>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::OpenMP>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::OpenMP>(100,3) ) );
+#endif
+}
+
+TEST_F( openmp , view_remap )
+{
+  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
+
+  typedef Kokkos::View< double*[N1][N2][N3] ,
+                             Kokkos::LayoutRight ,
+                             Kokkos::OpenMP > output_type ;
+
+  typedef Kokkos::View< int**[N2][N3] ,
+                             Kokkos::LayoutLeft ,
+                             Kokkos::OpenMP > input_type ;
+
+  typedef Kokkos::View< int*[N0][N2][N3] ,
+                             Kokkos::LayoutLeft ,
+                             Kokkos::OpenMP > diff_type ;
+
+  output_type output( "output" , N0 );
+  input_type  input ( "input" , N0 , N1 );
+  diff_type   diff  ( "diff" , N0 );
+
+  int value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+    input(i0,i1,i2,i3) = ++value ;
+  }}}}
+
+  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
+  Kokkos::deep_copy( output , input );
+
+  value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+    ++value ;
+    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
+  }}}}
+}
+
+//----------------------------------------------------------------------------
+
+
+TEST_F( openmp , view_aggregate )
+{
+  TestViewAggregate< Kokkos::OpenMP >();
+  TestViewAggregateReduction< Kokkos::OpenMP >();
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( openmp , scan )
+{
+  TestScan< Kokkos::OpenMP >::test_range( 1 , 1000 );
+  TestScan< Kokkos::OpenMP >( 1000000 );
+  TestScan< Kokkos::OpenMP >( 10000000 );
+  Kokkos::OpenMP::fence();
+}
+
+
+TEST_F( openmp , team_scan )
+{
+  TestScanTeam< Kokkos::OpenMP >( 10000 );
+  TestScanTeam< Kokkos::OpenMP >( 10000 );
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( openmp , compiler_macros )
+{
+  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::OpenMP >() ) );
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( openmp , memory_space )
+{
+  TestMemorySpace< Kokkos::OpenMP >();
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( openmp , template_meta_functions )
+{
+  TestTemplateMetaFunctions<int, Kokkos::OpenMP >();
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_HAVE_CXX11 ) && defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
+TEST_F( openmp , cxx11 )
+{
+  if ( Kokkos::Impl::is_same< Kokkos::DefaultExecutionSpace , Kokkos::OpenMP >::value ) {
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(1) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(2) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(3) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::OpenMP >(4) ) );
+  }
+}
+#endif
+
+#if defined (KOKKOS_HAVE_CXX11)
+TEST_F( openmp , reduction_deduction )
+{
+  TestCXX11::test_reduction_deduction< Kokkos::OpenMP >();
+}
+
+TEST_F( openmp , team_vector )
+{
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(0) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(1) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(2) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(3) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(4) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(5) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(6) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(7) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(8) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(9) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::OpenMP >(10) ) );
+}
+#endif
+} // namespace test
+
diff --git a/lib/kokkos/core/unit_test/TestQthread.cpp b/lib/kokkos/core/unit_test/TestQthread.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..19bfa6bde4cc379370eee7501adc9926573580a5
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestQthread.cpp
@@ -0,0 +1,283 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <Kokkos_Qthread.hpp>
+
+#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <TestViewImpl.hpp>
+#include <TestAtomic.hpp>
+
+#include <TestViewAPI.hpp>
+
+#include <TestTeam.hpp>
+#include <TestRange.hpp>
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestAggregate.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestTaskPolicy.hpp>
+// #include <TestTeamVector.hpp>
+
+namespace Test {
+
+class qthread : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
+    const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+    const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+    int threads_count = std::max( 1u , numa_count )
+                      * std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 );
+    Kokkos::Qthread::initialize( threads_count );
+    Kokkos::Qthread::print_configuration( std::cout , true );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::Qthread::finalize();
+  }
+};
+
+TEST_F( qthread , compiler_macros )
+{
+  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Qthread >() ) );
+}
+
+TEST_F( qthread, view_impl) {
+  test_view_impl< Kokkos::Qthread >();
+}
+
+TEST_F( qthread, view_api) {
+  TestViewAPI< double , Kokkos::Qthread >();
+}
+
+TEST_F( qthread , range_tag )
+{
+  TestRange< Kokkos::Qthread >::test_for(1000);
+  TestRange< Kokkos::Qthread >::test_reduce(1000);
+  TestRange< Kokkos::Qthread >::test_scan(1000);
+}
+
+TEST_F( qthread , team_tag )
+{
+  TestTeamPolicy< Kokkos::Qthread >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Qthread >::test_reduce( 1000 );
+}
+
+TEST_F( qthread, long_reduce) {
+  TestReduce< long ,   Kokkos::Qthread >( 1000000 );
+}
+
+TEST_F( qthread, double_reduce) {
+  TestReduce< double ,   Kokkos::Qthread >( 1000000 );
+}
+
+TEST_F( qthread, long_reduce_dynamic ) {
+  TestReduceDynamic< long ,   Kokkos::Qthread >( 1000000 );
+}
+
+TEST_F( qthread, double_reduce_dynamic ) {
+  TestReduceDynamic< double ,   Kokkos::Qthread >( 1000000 );
+}
+
+TEST_F( qthread, long_reduce_dynamic_view ) {
+  TestReduceDynamicView< long ,   Kokkos::Qthread >( 1000000 );
+}
+
+TEST_F( qthread, team_long_reduce) {
+  TestReduceTeam< long ,   Kokkos::Qthread >( 1000000 );
+}
+
+TEST_F( qthread, team_double_reduce) {
+  TestReduceTeam< double ,   Kokkos::Qthread >( 1000000 );
+}
+
+
+TEST_F( qthread , atomics )
+{
+  const int loop_count = 1e4 ;
+
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Qthread>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Qthread>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Qthread>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Qthread>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Qthread>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Qthread>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Qthread>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Qthread>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Qthread>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Qthread>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Qthread>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Qthread>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Qthread>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Qthread>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Qthread>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Qthread>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Qthread>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Qthread>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Qthread>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Qthread>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Qthread>(100,3) ) );
+
+#if defined( KOKKOS_ENABLE_ASM )
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Qthread>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Qthread>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Qthread>(100,3) ) );
+#endif
+
+}
+
+TEST_F( qthread , view_remap )
+{
+  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
+
+  typedef Kokkos::View< double*[N1][N2][N3] ,
+                             Kokkos::LayoutRight ,
+                             Kokkos::Qthread > output_type ;
+
+  typedef Kokkos::View< int**[N2][N3] ,
+                             Kokkos::LayoutLeft ,
+                             Kokkos::Qthread > input_type ;
+
+  typedef Kokkos::View< int*[N0][N2][N3] ,
+                             Kokkos::LayoutLeft ,
+                             Kokkos::Qthread > diff_type ;
+
+  output_type output( "output" , N0 );
+  input_type  input ( "input" , N0 , N1 );
+  diff_type   diff  ( "diff" , N0 );
+
+  int value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+    input(i0,i1,i2,i3) = ++value ;
+  }}}}
+
+  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
+  Kokkos::deep_copy( output , input );
+
+  value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+    ++value ;
+    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
+  }}}}
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( qthread , view_aggregate )
+{
+  TestViewAggregate< Kokkos::Qthread >();
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( qthread , scan )
+{
+  TestScan< Kokkos::Qthread >::test_range( 1 , 1000 );
+  TestScan< Kokkos::Qthread >( 1000000 );
+  TestScan< Kokkos::Qthread >( 10000000 );
+  Kokkos::Qthread::fence();
+}
+
+TEST_F( qthread, team_shared ) {
+  TestSharedTeam< Kokkos::Qthread >();
+}
+
+TEST_F( qthread , team_scan )
+{
+  TestScanTeam< Kokkos::Qthread >( 10 );
+  TestScanTeam< Kokkos::Qthread >( 10000 );
+}
+
+#if defined (KOKKOS_HAVE_CXX11) && 0 /* disable */
+TEST_F( qthread , team_vector )
+{
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(0) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(1) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(2) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(3) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Qthread >(4) ) );
+}
+#endif
+
+//----------------------------------------------------------------------------
+
+TEST_F( qthread , task_policy )
+{
+  TestTaskPolicy::test_task_dep< Kokkos::Qthread >( 10 );
+  for ( long i = 0 ; i < 25 ; ++i ) TestTaskPolicy::test_fib< Kokkos::Qthread >(i);
+  for ( long i = 0 ; i < 35 ; ++i ) TestTaskPolicy::test_fib2< Kokkos::Qthread >(i);
+}
+
+#if defined( KOKKOS_HAVE_CXX11 )
+TEST_F( qthread , task_team )
+{
+  std::cout << "qthread.task_team test disabled due to unresolved error causing the test to hang." << std::endl ;
+  // TestTaskPolicy::test_task_team< Kokkos::Qthread >(1000);
+}
+#endif
+
+//----------------------------------------------------------------------------
+
+} // namespace test
+
diff --git a/lib/kokkos/core/unit_test/TestRange.hpp b/lib/kokkos/core/unit_test/TestRange.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..1af53132723209831c3a28384523f539bc456720
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestRange.hpp
@@ -0,0 +1,171 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdio.h>
+
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+namespace {
+
+template< class ExecSpace >
+struct TestRange {
+
+  typedef int value_type ; ///< typedef required for the parallel_reduce
+
+  typedef Kokkos::View<int*,ExecSpace> view_type ;
+
+  view_type m_flags ;
+
+  struct VerifyInitTag {};
+  struct ResetTag {};
+  struct VerifyResetTag {};
+
+  TestRange( const size_t N )
+    : m_flags( Kokkos::ViewAllocateWithoutInitializing("flags"), N )
+    {}
+
+  static void test_for( const size_t N )
+    {
+      TestRange functor(N);
+
+      typename view_type::HostMirror host_flags = Kokkos::create_mirror_view( functor.m_flags );
+
+      Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace>(0,N) , functor );
+      Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace,VerifyInitTag>(0,N) , functor );
+
+      Kokkos::deep_copy( host_flags , functor.m_flags );
+
+      size_t error_count = 0 ;
+      for ( size_t i = 0 ; i < N ; ++i ) {
+        if ( int(i) != host_flags(i) ) ++error_count ;
+      }
+      ASSERT_EQ( error_count , size_t(0) );
+
+      Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace,ResetTag>(0,N) , functor );
+      Kokkos::parallel_for( std::string("TestKernelFor") , Kokkos::RangePolicy<ExecSpace,VerifyResetTag>(0,N) , functor );
+
+      Kokkos::deep_copy( host_flags , functor.m_flags );
+
+      error_count = 0 ;
+      for ( size_t i = 0 ; i < N ; ++i ) {
+        if ( int(2*i) != host_flags(i) ) ++error_count ;
+      }
+      ASSERT_EQ( error_count , size_t(0) );
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i ) const
+    { m_flags(i) = i ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const VerifyInitTag & , const int i ) const
+    { if ( i != m_flags(i) ) { printf("TestRange::test_for error at %d != %d\n",i,m_flags(i)); } }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const ResetTag & , const int i ) const
+    { m_flags(i) = 2 * m_flags(i); }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const VerifyResetTag & , const int i ) const
+    { if ( 2 * i != m_flags(i) ) { printf("TestRange::test_for error at %d != %d\n",i,m_flags(i)); } }
+
+  //----------------------------------------
+
+  struct OffsetTag {};
+
+  static void test_reduce( const size_t N )
+    {
+      TestRange functor(N);
+      int total = 0 ;
+
+      Kokkos::parallel_for(    Kokkos::RangePolicy<ExecSpace>(0,N) , functor );
+
+      Kokkos::parallel_reduce( "TestKernelReduce" , Kokkos::RangePolicy<ExecSpace>(0,N) , functor , total );
+      // sum( 0 .. N-1 )
+      ASSERT_EQ( size_t((N-1)*(N)/2) , size_t(total) );
+
+      Kokkos::parallel_reduce( Kokkos::RangePolicy<ExecSpace,OffsetTag>(0,N) , functor , total );
+      // sum( 1 .. N )
+      ASSERT_EQ( size_t((N)*(N+1)/2) , size_t(total) );
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int i , value_type & update ) const
+    { update += m_flags(i); }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const OffsetTag & , const int i , value_type & update ) const
+    { update += 1 + m_flags(i); }
+
+  //----------------------------------------
+
+  static void test_scan( const size_t N )
+    {
+      TestRange functor(N);
+
+      Kokkos::parallel_for( Kokkos::RangePolicy<ExecSpace>(0,N) , functor );
+
+      Kokkos::parallel_scan( "TestKernelScan" , Kokkos::RangePolicy<ExecSpace,OffsetTag>(0,N) , functor );
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const OffsetTag & , const int i , value_type & update , bool final ) const
+    {
+      update += m_flags(i);
+
+      if ( final ) {
+        if ( update != (i*(i+1))/2 ) {
+          printf("TestRange::test_scan error %d : %d != %d\n",i,(i*(i+1))/2,m_flags(i));
+        }
+      }
+    }
+};
+
+} /* namespace */
+} /* namespace Test */
+
+/*--------------------------------------------------------------------------*/
+
diff --git a/lib/kokkos/core/unit_test/TestReduce.hpp b/lib/kokkos/core/unit_test/TestReduce.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..30b94d40fb43a854fc85352c7a779a32f4cf32ea
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestReduce.hpp
@@ -0,0 +1,371 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+template< typename ScalarType , class DeviceType >
+class ReduceFunctor
+{
+public:
+  typedef DeviceType  execution_space ;
+  typedef typename execution_space::size_type size_type ;
+
+  struct value_type {
+    ScalarType value[3] ;
+  };
+
+  const size_type nwork ;
+
+  ReduceFunctor( const size_type & arg_nwork ) : nwork( arg_nwork ) {}
+
+  ReduceFunctor( const ReduceFunctor & rhs )
+    : nwork( rhs.nwork ) {}
+
+/*
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & dst ) const
+  {
+    dst.value[0] = 0 ;
+    dst.value[1] = 0 ;
+    dst.value[2] = 0 ;
+  }
+*/
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst ,
+             const volatile value_type & src ) const
+  {
+    dst.value[0] += src.value[0] ;
+    dst.value[1] += src.value[1] ;
+    dst.value[2] += src.value[2] ;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type iwork , value_type & dst ) const
+  {
+    dst.value[0] += 1 ;
+    dst.value[1] += iwork + 1 ;
+    dst.value[2] += nwork - iwork ;
+  }
+};
+
+template< class DeviceType >
+class ReduceFunctorFinal : public ReduceFunctor< long , DeviceType > {
+public:
+
+  typedef typename ReduceFunctor< long , DeviceType >::value_type value_type ;
+
+  ReduceFunctorFinal( const size_t n )
+    : ReduceFunctor<long,DeviceType>(n)
+    {}
+
+  KOKKOS_INLINE_FUNCTION
+  void final( value_type & dst ) const
+  {
+    dst.value[0] = - dst.value[0] ;
+    dst.value[1] = - dst.value[1] ;
+    dst.value[2] = - dst.value[2] ;
+  }
+};
+
+template< typename ScalarType , class DeviceType >
+class RuntimeReduceFunctor
+{
+public:
+  // Required for functor:
+  typedef DeviceType  execution_space ;
+  typedef ScalarType  value_type[] ;
+  const unsigned      value_count ;
+
+
+  // Unit test details:
+
+  typedef typename execution_space::size_type  size_type ;
+
+  const size_type     nwork ;
+
+  RuntimeReduceFunctor( const size_type arg_nwork ,
+                        const size_type arg_count )
+    : value_count( arg_count )
+    , nwork( arg_nwork ) {}
+
+/*
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type dst ) const
+  {
+    for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] = 0 ;
+  }
+*/
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile ScalarType dst[] ,
+             const volatile ScalarType src[] ) const
+  {
+    for ( unsigned i = 0 ; i < value_count ; ++i ) dst[i] += src[i] ;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_type iwork , ScalarType dst[] ) const
+  {
+    const size_type tmp[3] = { 1 , iwork + 1 , nwork - iwork };
+
+    for ( size_type i = 0 ; i < value_count ; ++i ) {
+      dst[i] += tmp[ i % 3 ];
+    }
+  }
+};
+
+template< class DeviceType >
+class RuntimeReduceFunctorFinal : public RuntimeReduceFunctor< long , DeviceType > {
+public:
+
+  typedef RuntimeReduceFunctor< long , DeviceType > base_type ;
+  typedef typename base_type::value_type value_type ;
+  typedef long scalar_type ;
+
+  RuntimeReduceFunctorFinal( const size_t theNwork , const size_t count ) : base_type(theNwork,count) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void final( value_type dst ) const
+  {
+    for ( unsigned i = 0 ; i < base_type::value_count ; ++i ) {
+      dst[i] = - dst[i] ;
+    }
+  }
+};
+} // namespace Test
+
+namespace {
+
+template< typename ScalarType , class DeviceType >
+class TestReduce
+{
+public:
+  typedef DeviceType    execution_space ;
+  typedef typename execution_space::size_type size_type ;
+
+  //------------------------------------
+
+  TestReduce( const size_type & nwork )
+  {
+    run_test(nwork);
+    run_test_final(nwork);
+  }
+
+  void run_test( const size_type & nwork )
+  {
+    typedef Test::ReduceFunctor< ScalarType , execution_space > functor_type ;
+    typedef typename functor_type::value_type value_type ;
+
+    enum { Count = 3 };
+    enum { Repeat = 100 };
+
+    value_type result[ Repeat ];
+
+    const unsigned long nw   = nwork ;
+    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
+                                      : (nw/2) * ( nw + 1 );
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      Kokkos::parallel_reduce( nwork , functor_type(nwork) , result[i] );
+    }
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      for ( unsigned j = 0 ; j < Count ; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
+        ASSERT_EQ( (ScalarType) correct , result[i].value[j] );
+      }
+    }
+  }
+
+  void run_test_final( const size_type & nwork )
+  {
+    typedef Test::ReduceFunctorFinal< execution_space > functor_type ;
+    typedef typename functor_type::value_type value_type ;
+
+    enum { Count = 3 };
+    enum { Repeat = 100 };
+
+    value_type result[ Repeat ];
+
+    const unsigned long nw   = nwork ;
+    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
+                                      : (nw/2) * ( nw + 1 );
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      Kokkos::parallel_reduce( nwork , functor_type(nwork) , result[i] );
+    }
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      for ( unsigned j = 0 ; j < Count ; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
+        ASSERT_EQ( (ScalarType) correct , - result[i].value[j] );
+      }
+    }
+  }
+};
+
+template< typename ScalarType , class DeviceType >
+class TestReduceDynamic
+{
+public:
+  typedef DeviceType    execution_space ;
+  typedef typename execution_space::size_type size_type ;
+
+  //------------------------------------
+
+  TestReduceDynamic( const size_type nwork )
+  {
+    run_test_dynamic(nwork);
+    run_test_dynamic_final(nwork);
+  }
+
+  void run_test_dynamic( const size_type nwork )
+  {
+    typedef Test::RuntimeReduceFunctor< ScalarType , execution_space > functor_type ;
+
+    enum { Count = 3 };
+    enum { Repeat = 100 };
+
+    ScalarType result[ Repeat ][ Count ] ;
+
+    const unsigned long nw   = nwork ;
+    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
+                                      : (nw/2) * ( nw + 1 );
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      Kokkos::parallel_reduce( nwork , functor_type(nwork,Count) , result[i] );
+    }
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      for ( unsigned j = 0 ; j < Count ; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
+        ASSERT_EQ( (ScalarType) correct , result[i][j] );
+      }
+    }
+  }
+
+  void run_test_dynamic_final( const size_type nwork )
+  {
+    typedef Test::RuntimeReduceFunctorFinal< execution_space > functor_type ;
+
+    enum { Count = 3 };
+    enum { Repeat = 100 };
+
+    typename functor_type::scalar_type result[ Repeat ][ Count ] ;
+
+    const unsigned long nw   = nwork ;
+    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
+                                      : (nw/2) * ( nw + 1 );
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      Kokkos::parallel_reduce( "TestKernelReduce" , nwork , functor_type(nwork,Count) , result[i] );
+    }
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      for ( unsigned j = 0 ; j < Count ; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
+        ASSERT_EQ( (ScalarType) correct , - result[i][j] );
+      }
+    }
+  }
+};
+
+template< typename ScalarType , class DeviceType >
+class TestReduceDynamicView
+{
+public:
+  typedef DeviceType    execution_space ;
+  typedef typename execution_space::size_type size_type ;
+
+  //------------------------------------
+
+  TestReduceDynamicView( const size_type nwork )
+  {
+    run_test_dynamic_view(nwork);
+  }
+
+  void run_test_dynamic_view( const size_type nwork )
+  {
+    typedef Test::RuntimeReduceFunctor< ScalarType , execution_space > functor_type ;
+
+    typedef Kokkos::View< ScalarType* , DeviceType > result_type ;
+    typedef typename result_type::HostMirror result_host_type ;
+
+    const unsigned CountLimit = 23 ;
+
+    const unsigned long nw   = nwork ;
+    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
+                                      : (nw/2) * ( nw + 1 );
+
+    for ( unsigned count = 0 ; count < CountLimit ; ++count ) {
+
+      result_type result("result",count);
+      result_host_type host_result = Kokkos::create_mirror( result );
+
+      // Test result to host pointer:
+
+      std::string str("TestKernelReduce");
+      Kokkos::parallel_reduce( str , nw , functor_type(nw,count) , host_result.ptr_on_device() );
+
+      for ( unsigned j = 0 ; j < count ; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
+        ASSERT_EQ( host_result(j), (ScalarType) correct );
+        host_result(j) = 0 ;
+      }
+    }
+  }
+};
+
+}
+
+/*--------------------------------------------------------------------------*/
+
diff --git a/lib/kokkos/core/unit_test/TestScan.hpp b/lib/kokkos/core/unit_test/TestScan.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..eb5e833a1d1ddfddf89ed858d80144d38192c182
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestScan.hpp
@@ -0,0 +1,97 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+/*--------------------------------------------------------------------------*/
+
+#include <stdio.h>
+
+namespace Test {
+
+template< class Device , class WorkSpec = size_t >
+struct TestScan {
+
+  typedef  Device    execution_space ;
+  typedef  long int  value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int iwork , value_type & update , const bool final_pass ) const
+  {
+    const value_type n = iwork + 1 ;
+    const value_type imbalance = ( (1000 <= n) && (0 == n % 1000) ) ? 1000 : 0 ;
+
+    // Insert an artificial load imbalance
+
+    for ( value_type i = 0 ; i < imbalance ; ++i ) { ++update ; }
+
+    update += n - imbalance ;
+
+    if ( final_pass ) {
+      const value_type answer = n & 1 ? ( n * ( ( n + 1 ) / 2 ) ) : ( ( n / 2 ) * ( n + 1 ) );
+
+      if ( answer != update ) {
+        printf("TestScan(%d,%ld) != %ld\n",iwork,update,answer);
+      }
+    }
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & update ) const { update = 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile       value_type & update ,
+             volatile const value_type & input ) const
+  { update += input ; }
+
+  TestScan( const WorkSpec & N )
+    { parallel_scan( N , *this ); }
+
+  static void test_range( const WorkSpec & begin , const WorkSpec & end )
+    {
+      for ( WorkSpec i = begin ; i < end ; ++i ) {
+        (void) TestScan( i );
+      }
+    }
+};
+
+}
+
diff --git a/lib/kokkos/core/unit_test/TestSerial.cpp b/lib/kokkos/core/unit_test/TestSerial.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..dbe94005e80e3b6ef80f6579135ffd199ba2bf26
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestSerial.cpp
@@ -0,0 +1,419 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+
+#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
+
+#include <impl/Kokkos_ViewTileLeft.hpp>
+#include <TestTile.hpp>
+
+#endif
+
+#include <impl/Kokkos_Serial_TaskPolicy.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+
+#include <TestViewImpl.hpp>
+
+#include <TestViewAPI.hpp>
+#include <TestViewOfClass.hpp>
+#include <TestViewSubview.hpp>
+#include <TestAtomic.hpp>
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestAggregate.hpp>
+#include <TestAggregateReduction.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestTaskPolicy.hpp>
+#include <TestCXX11.hpp>
+#include <TestCXX11Deduction.hpp>
+#include <TestTeamVector.hpp>
+#include <TestMemorySpaceTracking.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+
+namespace Test {
+
+class serial : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+    {
+      Kokkos::HostSpace::execution_space::initialize();
+    }
+  static void TearDownTestCase()
+    {
+      Kokkos::HostSpace::execution_space::finalize();
+    }
+};
+
+TEST_F( serial , impl_shared_alloc ) {
+  test_shared_alloc< Kokkos::HostSpace , Kokkos::Serial >();
+}
+
+TEST_F( serial , impl_view_mapping ) {
+  test_view_mapping< Kokkos::Serial >();
+  test_view_mapping_subview< Kokkos::Serial >();
+  test_view_mapping_operator< Kokkos::Serial >();
+  TestViewMappingAtomic< Kokkos::Serial >::run();
+}
+
+TEST_F( serial, view_impl) {
+  test_view_impl< Kokkos::Serial >();
+}
+
+TEST_F( serial, view_api) {
+  TestViewAPI< double , Kokkos::Serial >();
+}
+
+TEST_F( serial , view_nested_view )
+{
+  ::Test::view_nested_view< Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_auto_1d_left ) {
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_auto_1d_right ) {
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_auto_1d_stride ) {
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_assign_strided ) {
+  TestViewSubview::test_1d_strided_assignment< Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_left_0 ) {
+  TestViewSubview::test_left_0< Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_left_1 ) {
+  TestViewSubview::test_left_1< Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_left_2 ) {
+  TestViewSubview::test_left_2< Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_left_3 ) {
+  TestViewSubview::test_left_3< Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_right_0 ) {
+  TestViewSubview::test_right_0< Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_right_1 ) {
+  TestViewSubview::test_right_1< Kokkos::Serial >();
+}
+
+TEST_F( serial, view_subview_right_3 ) {
+  TestViewSubview::test_right_3< Kokkos::Serial >();
+}
+
+TEST_F( serial , range_tag )
+{
+  TestRange< Kokkos::Serial >::test_for(1000);
+  TestRange< Kokkos::Serial >::test_reduce(1000);
+  TestRange< Kokkos::Serial >::test_scan(1000);
+}
+
+TEST_F( serial , team_tag )
+{
+  TestTeamPolicy< Kokkos::Serial >::test_for( 1000 );
+  TestTeamPolicy< Kokkos::Serial >::test_reduce( 1000 );
+}
+
+TEST_F( serial, long_reduce) {
+  TestReduce< long ,   Kokkos::Serial >( 1000000 );
+}
+
+TEST_F( serial, double_reduce) {
+  TestReduce< double ,   Kokkos::Serial >( 1000000 );
+}
+
+TEST_F( serial, long_reduce_dynamic ) {
+  TestReduceDynamic< long ,   Kokkos::Serial >( 1000000 );
+}
+
+TEST_F( serial, double_reduce_dynamic ) {
+  TestReduceDynamic< double ,   Kokkos::Serial >( 1000000 );
+}
+
+TEST_F( serial, long_reduce_dynamic_view ) {
+  TestReduceDynamicView< long ,   Kokkos::Serial >( 1000000 );
+}
+
+TEST_F( serial , scan )
+{
+  TestScan< Kokkos::Serial >::test_range( 1 , 1000 );
+  TestScan< Kokkos::Serial >( 10 );
+  TestScan< Kokkos::Serial >( 10000 );
+}
+
+TEST_F( serial , team_long_reduce) {
+  TestReduceTeam< long ,   Kokkos::Serial >( 100000 );
+}
+
+TEST_F( serial , team_double_reduce) {
+  TestReduceTeam< double ,   Kokkos::Serial >( 100000 );
+}
+
+TEST_F( serial , team_shared_request) {
+  TestSharedTeam< Kokkos::Serial >();
+}
+
+TEST_F( serial  , team_scan )
+{
+  TestScanTeam< Kokkos::Serial >( 10 );
+  TestScanTeam< Kokkos::Serial >( 10000 );
+}
+
+
+TEST_F( serial , view_remap )
+{
+  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
+
+  typedef Kokkos::View< double*[N1][N2][N3] ,
+                             Kokkos::LayoutRight ,
+                             Kokkos::Serial > output_type ;
+
+  typedef Kokkos::View< int**[N2][N3] ,
+                             Kokkos::LayoutLeft ,
+                             Kokkos::Serial > input_type ;
+
+  typedef Kokkos::View< int*[N0][N2][N3] ,
+                             Kokkos::LayoutLeft ,
+                             Kokkos::Serial > diff_type ;
+
+  output_type output( "output" , N0 );
+  input_type  input ( "input" , N0 , N1 );
+  diff_type   diff  ( "diff" , N0 );
+
+  int value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+    input(i0,i1,i2,i3) = ++value ;
+  }}}}
+
+  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
+  Kokkos::deep_copy( output , input );
+
+  value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+    ++value ;
+    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
+  }}}}
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( serial , view_aggregate )
+{
+  TestViewAggregate< Kokkos::Serial >();
+  TestViewAggregateReduction< Kokkos::Serial >();
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( serial , atomics )
+{
+  const int loop_count = 1e6 ;
+
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Serial>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Serial>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Serial>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Serial>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Serial>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Serial>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Serial>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Serial>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Serial>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Serial>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Serial>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Serial>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Serial>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Serial>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Serial>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Serial>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Serial>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Serial>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Serial>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Serial>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Serial>(100,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Serial>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Serial>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Serial>(100,3) ) );
+}
+
+//----------------------------------------------------------------------------
+
+#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
+
+TEST_F( serial, tile_layout )
+{
+  TestTile::test< Kokkos::Serial , 1 , 1 >( 1 , 1 );
+  TestTile::test< Kokkos::Serial , 1 , 1 >( 2 , 3 );
+  TestTile::test< Kokkos::Serial , 1 , 1 >( 9 , 10 );
+
+  TestTile::test< Kokkos::Serial , 2 , 2 >( 1 , 1 );
+  TestTile::test< Kokkos::Serial , 2 , 2 >( 2 , 3 );
+  TestTile::test< Kokkos::Serial , 2 , 2 >( 4 , 4 );
+  TestTile::test< Kokkos::Serial , 2 , 2 >( 9 , 9 );
+
+  TestTile::test< Kokkos::Serial , 2 , 4 >( 9 , 9 );
+  TestTile::test< Kokkos::Serial , 4 , 2 >( 9 , 9 );
+
+  TestTile::test< Kokkos::Serial , 4 , 4 >( 1 , 1 );
+  TestTile::test< Kokkos::Serial , 4 , 4 >( 4 , 4 );
+  TestTile::test< Kokkos::Serial , 4 , 4 >( 9 , 9 );
+  TestTile::test< Kokkos::Serial , 4 , 4 >( 9 , 11 );
+
+  TestTile::test< Kokkos::Serial , 8 , 8 >( 1 , 1 );
+  TestTile::test< Kokkos::Serial , 8 , 8 >( 4 , 4 );
+  TestTile::test< Kokkos::Serial , 8 , 8 >( 9 , 9 );
+  TestTile::test< Kokkos::Serial , 8 , 8 >( 9 , 11 );
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+TEST_F( serial , compiler_macros )
+{
+  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Serial >() ) );
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( serial , memory_space )
+{
+  TestMemorySpace< Kokkos::Serial >();
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( serial , task_policy )
+{
+  TestTaskPolicy::test_task_dep< Kokkos::Serial >( 10 );
+  // TestTaskPolicy::test_norm2< Kokkos::Serial >( 1000 );
+  // for ( long i = 0 ; i < 30 ; ++i ) TestTaskPolicy::test_fib< Kokkos::Serial >(i);
+  // for ( long i = 0 ; i < 40 ; ++i ) TestTaskPolicy::test_fib2< Kokkos::Serial >(i);
+  for ( long i = 0 ; i < 20 ; ++i ) TestTaskPolicy::test_fib< Kokkos::Serial >(i);
+  for ( long i = 0 ; i < 25 ; ++i ) TestTaskPolicy::test_fib2< Kokkos::Serial >(i);
+}
+
+#if defined( KOKKOS_HAVE_CXX11 )
+TEST_F( serial , task_team )
+{
+  TestTaskPolicy::test_task_team< Kokkos::Serial >(1000);
+}
+#endif
+
+//----------------------------------------------------------------------------
+
+TEST_F( serial , template_meta_functions )
+{
+  TestTemplateMetaFunctions<int, Kokkos::Serial >();
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_HAVE_CXX11 ) && defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_SERIAL )
+TEST_F( serial , cxx11 )
+{
+  if ( Kokkos::Impl::is_same< Kokkos::DefaultExecutionSpace , Kokkos::Serial >::value ) {
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >(1) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >(2) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >(3) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Serial >(4) ) );
+  }
+}
+#endif
+
+#if defined (KOKKOS_HAVE_CXX11)
+TEST_F( serial , reduction_deduction )
+{
+  TestCXX11::test_reduction_deduction< Kokkos::Serial >();
+}
+
+TEST_F( serial , team_vector )
+{
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(0) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(1) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(2) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(3) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(4) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(5) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(6) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(7) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(8) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(9) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Serial >(10) ) );
+}
+#endif
+
+} // namespace test
+
diff --git a/lib/kokkos/core/unit_test/TestSharedAlloc.hpp b/lib/kokkos/core/unit_test/TestSharedAlloc.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..060f5f4605d1b70e76918f05b103a24d778bcd59
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestSharedAlloc.hpp
@@ -0,0 +1,204 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+struct SharedAllocDestroy {
+
+  volatile int * count ;
+
+  SharedAllocDestroy() = default ;
+  SharedAllocDestroy( int * arg ) : count( arg ) {}
+
+  void destroy_shared_allocation()
+    {
+      Kokkos::atomic_fetch_add( count , 1 );
+    }
+
+};
+
+template< class MemorySpace , class ExecutionSpace >
+void test_shared_alloc()
+{
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+
+  typedef const Kokkos::Experimental::Impl::SharedAllocationHeader   Header ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationTracker  Tracker ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< void , void >                       RecordBase ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , void >                RecordMemS ;
+  typedef Kokkos::Experimental::Impl::SharedAllocationRecord< MemorySpace , SharedAllocDestroy >  RecordFull ;
+
+  static_assert( sizeof(Tracker) == sizeof(int*), "SharedAllocationTracker has wrong size!" );
+
+  MemorySpace s ;
+
+  const size_t N = 1200 ;
+  const size_t size = 8 ;
+
+  RecordMemS * rarray[ N ];
+  Header     * harray[ N ];
+
+  RecordMemS ** const r = rarray ;
+  Header     ** const h = harray ;
+
+  Kokkos::RangePolicy< ExecutionSpace > range(0,N);
+  
+  //----------------------------------------
+  {
+    Kokkos::parallel_for( range , [=]( size_t i ){
+      char name[64] ;
+      sprintf(name,"test_%.2d",int(i));
+
+      r[i] = RecordMemS::allocate( s , name , size * ( i + 1 ) );
+      h[i] = Header::get_header( r[i]->data() );
+
+      ASSERT_EQ( r[i]->use_count() , 0 );
+
+      for ( size_t j = 0 ; j < ( i / 10 ) + 1 ; ++j ) RecordBase::increment( r[i] );
+
+      ASSERT_EQ( r[i]->use_count() , ( i / 10 ) + 1 );
+      ASSERT_EQ( r[i] , RecordMemS::get_record( r[i]->data() ) );
+    });
+
+    // Sanity check for the whole set of allocation records to which this record belongs.
+    RecordBase::is_sane( r[0] );
+    // RecordMemS::print_records( std::cout , s , true );
+
+    Kokkos::parallel_for( range , [=]( size_t i ){
+      while ( 0 != ( r[i] = static_cast< RecordMemS *>( RecordBase::decrement( r[i] ) ) ) ) {
+        if ( r[i]->use_count() == 1 ) RecordBase::is_sane( r[i] );
+      }
+    });
+  }
+  //----------------------------------------
+  {
+    int destroy_count = 0 ;
+    SharedAllocDestroy counter( & destroy_count );
+
+    Kokkos::parallel_for( range , [=]( size_t i ){
+      char name[64] ;
+      sprintf(name,"test_%.2d",int(i));
+
+      RecordFull * rec = RecordFull::allocate( s , name , size * ( i + 1 ) );
+
+      rec->m_destroy = counter ;
+
+      r[i] = rec ;
+      h[i] = Header::get_header( r[i]->data() );
+
+      ASSERT_EQ( r[i]->use_count() , 0 );
+
+      for ( size_t j = 0 ; j < ( i / 10 ) + 1 ; ++j ) RecordBase::increment( r[i] );
+
+      ASSERT_EQ( r[i]->use_count() , ( i / 10 ) + 1 );
+      ASSERT_EQ( r[i] , RecordMemS::get_record( r[i]->data() ) );
+    });
+
+    RecordBase::is_sane( r[0] );
+
+    Kokkos::parallel_for( range , [=]( size_t i ){
+      while ( 0 != ( r[i] = static_cast< RecordMemS *>( RecordBase::decrement( r[i] ) ) ) ) {
+        if ( r[i]->use_count() == 1 ) RecordBase::is_sane( r[i] );
+      }
+    });
+
+    ASSERT_EQ( destroy_count , int(N) );
+  }
+
+  //----------------------------------------
+  {
+    int destroy_count = 0 ;
+
+    {
+      RecordFull * rec = RecordFull::allocate( s , "test" , size );
+
+      // ... Construction of the allocated { rec->data() , rec->size() }
+
+      // Copy destruction function object into the allocation record
+      rec->m_destroy = SharedAllocDestroy( & destroy_count );
+
+      // Start tracking, increments the use count from 0 to 1
+      Tracker track( rec );
+
+      ASSERT_EQ( rec->use_count() , 1 );
+
+      // Verify construction / destruction increment
+      for ( size_t i = 0 ; i < N ; ++i ) {
+        ASSERT_EQ( rec->use_count() , 1 );
+        {
+          Tracker local_tracker( rec );
+          ASSERT_EQ( rec->use_count() , 2 );
+        }
+        ASSERT_EQ( rec->use_count() , 1 );
+      }
+
+      Kokkos::parallel_for( range , [=]( size_t i ){
+        Tracker local_tracker( rec );
+        ASSERT_GT( rec->use_count() , 1 );
+      });
+
+      ASSERT_EQ( rec->use_count() , 1 );
+
+      // Destruction of 'track' object deallocates the 'rec' and invokes the destroy function object.
+    }
+
+    ASSERT_EQ( destroy_count , 1 );
+  }
+
+#endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
+
+}
+
+
+}
+
diff --git a/lib/kokkos/core/unit_test/TestTaskPolicy.hpp b/lib/kokkos/core/unit_test/TestTaskPolicy.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..96a5ca3b01208e485a887aed9a7dce8d547f31fb
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestTaskPolicy.hpp
@@ -0,0 +1,494 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+
+#ifndef KOKKOS_UNITTEST_TASKPOLICY_HPP
+#define KOKKOS_UNITTEST_TASKPOLICY_HPP
+
+#include <stdio.h>
+#include <iostream>
+#include <cmath>
+#include <Kokkos_TaskPolicy.hpp>
+
+namespace TestTaskPolicy {
+
+//----------------------------------------------------------------------------
+
+template< class ExecSpace >
+struct FibChild {
+
+  typedef long value_type ;
+
+  Kokkos::Experimental::TaskPolicy<ExecSpace> policy ;
+  const value_type n ;
+  int has_nested ;
+
+  FibChild( const Kokkos::Experimental::TaskPolicy<ExecSpace> & arg_policy
+          , const value_type arg_n )
+    : policy(arg_policy,2) /* default dependence capacity = 2 */
+    , n( arg_n ), has_nested(0) {}
+
+  inline
+  void apply( value_type & result )
+    {
+      if ( n < 2 ) {
+
+        has_nested = -1 ;
+
+        result = n ;
+      }
+      else {
+        if ( has_nested == 0 ) {
+          // Spawn new children and respawn myself to sum their results:
+          has_nested = 2 ;
+
+          Kokkos::Experimental::respawn
+            ( policy
+            , this
+            , Kokkos::Experimental::spawn( policy , FibChild(policy,n-1) )
+            , Kokkos::Experimental::spawn( policy , FibChild(policy,n-2) )
+            );
+
+        }
+        else if ( has_nested == 2 ) {
+
+          has_nested = -1 ;
+
+          const Kokkos::Experimental::Future<long,ExecSpace> fib_1 = policy.get_dependence(this,0);
+          const Kokkos::Experimental::Future<long,ExecSpace> fib_2 = policy.get_dependence(this,1);
+
+          result = fib_1.get() + fib_2.get();
+        }
+        else {
+          fprintf(stderr,"FibChild(%ld) execution error\n",(long)n);
+          fflush(stderr);
+        }
+      }
+    }
+};
+
+template< class ExecSpace >
+struct FibChild2 {
+
+  typedef long value_type ;
+
+  Kokkos::Experimental::TaskPolicy<ExecSpace> policy ;
+  const value_type n ;
+  int has_nested ;
+
+  FibChild2( const Kokkos::Experimental::TaskPolicy<ExecSpace> & arg_policy
+           , const value_type arg_n )
+    : policy(arg_policy,2) /* default dependence capacity = 2 */
+    , n( arg_n ), has_nested(0) {}
+
+  inline
+  void apply( value_type & result )
+    {
+      if ( 0 == has_nested ) {
+        if ( n < 2 ) {
+
+          has_nested = -1 ;
+
+          result = n ;
+        }
+        else if ( n < 4 ) {
+          // Spawn new children and respawn myself to sum their results:
+          // result = Fib(n-1) + Fib(n-2)
+          has_nested = 2 ;
+          // Kokkos::respawn implements the following steps:
+          policy.clear_dependence( this );
+          policy.add_dependence( this , Kokkos::Experimental::spawn( policy , FibChild2(policy,n-1) ) );
+          policy.add_dependence( this , Kokkos::Experimental::spawn( policy , FibChild2(policy,n-2) ) );
+          policy.respawn( this );
+        }
+        else {
+          // Spawn new children and respawn myself to sum their results:
+          // result = Fib(n-1) + Fib(n-2)
+          // result = ( Fib(n-2) + Fib(n-3) ) + ( Fib(n-3) + Fib(n-4) )
+          // result = ( ( Fib(n-3) + Fib(n-4) ) + Fib(n-3) ) + ( Fib(n-3) + Fib(n-4) )
+          // result = 3 * Fib(n-3) + 2 * Fib(n-4)
+          has_nested = 4 ;
+          // Kokkos::Experimental::respawn implements the following steps:
+          policy.clear_dependence( this );
+          policy.add_dependence( this , Kokkos::Experimental::spawn( policy , FibChild2(policy,n-3) ) );
+          policy.add_dependence( this , Kokkos::Experimental::spawn( policy , FibChild2(policy,n-4) ) );
+          policy.respawn( this );
+        }
+     }
+     else if ( 2 == has_nested || 4 == has_nested ) {
+        const Kokkos::Experimental::Future<long,ExecSpace> fib_a = policy.get_dependence(this,0);
+        const Kokkos::Experimental::Future<long,ExecSpace> fib_b = policy.get_dependence(this,1);
+
+        result = ( has_nested == 2 ) ? fib_a.get() + fib_b.get()
+                                     : 3 * fib_a.get() + 2 * fib_b.get() ;
+
+        has_nested = -1 ;
+      }
+      else {
+        fprintf(stderr,"FibChild2(%ld) execution error\n",(long)n);
+        fflush(stderr);
+      }
+    }
+};
+
+namespace {
+
+long eval_fib( long n )
+{
+  if ( n < 2 ) return n ;
+
+  std::vector<long> fib(n+1);
+
+  fib[0] = 0 ;
+  fib[1] = 1 ;
+
+  for ( long i = 2 ; i <= n ; ++i ) { fib[i] = fib[i-2] + fib[i-1]; }
+
+  return fib[n];
+}
+
+}
+
+template< class ExecSpace >
+void test_fib( long n )
+{
+  Kokkos::Experimental::TaskPolicy<ExecSpace> policy(2);
+
+  Kokkos::Experimental::Future<long,ExecSpace> f = Kokkos::Experimental::spawn( policy , FibChild<ExecSpace>(policy,n) );
+
+  Kokkos::Experimental::wait( policy );
+
+  if ( f.get() != eval_fib(n) ) {
+    std::cout << "Fib(" << n << ") = " << f.get();
+    std::cout << " != " << eval_fib(n);
+    std::cout << std::endl ;
+  }
+}
+
+template< class ExecSpace >
+void test_fib2( long n )
+{
+  Kokkos::Experimental::TaskPolicy<ExecSpace> policy(2); // default dependence capacity
+
+  Kokkos::Experimental::Future<long,ExecSpace> f = Kokkos::Experimental::spawn( policy , FibChild2<ExecSpace>(policy,n) );
+
+  Kokkos::Experimental::wait( policy );
+
+  if ( f.get() != eval_fib(n) ) {
+    std::cout << "Fib2(" << n << ") = " << f.get();
+    std::cout << " != " << eval_fib(n);
+    std::cout << std::endl ;
+  }
+}
+
+//----------------------------------------------------------------------------
+
+template< class ExecSpace >
+struct Norm2 {
+
+  typedef double value_type ;
+
+  const double * const m_x ;
+
+  Norm2( const double * x ) : m_x(x) {}
+
+  inline
+  void init( double & val ) const { val = 0 ; }
+
+  inline
+  void operator()( int i , double & val ) const { val += m_x[i] * m_x[i] ; }
+
+  void apply( double & dst ) const { dst = std::sqrt( dst ); }
+};
+
+template< class ExecSpace >
+void test_norm2( const int n )
+{
+  Kokkos::Experimental::TaskPolicy< ExecSpace > policy ;
+
+  double * const x = new double[n];
+
+  for ( int i = 0 ; i < n ; ++i ) x[i] = 1 ;
+
+  Kokkos::RangePolicy<ExecSpace> r(0,n);
+
+  Kokkos::Experimental::Future<double,ExecSpace> f = Kokkos::Experimental::spawn_reduce( policy , r , Norm2<ExecSpace>(x) );
+
+  Kokkos::Experimental::wait( policy );
+
+#if defined(PRINT)
+  std::cout << "Norm2: " << f.get() << std::endl ;
+#endif
+
+  delete[] x ;
+}
+
+//----------------------------------------------------------------------------
+
+template< class Space >
+struct TaskDep {
+
+  typedef int value_type ;
+  typedef Kokkos::Experimental::TaskPolicy< Space > policy_type ;
+
+  const policy_type policy ;
+  const int         input ;
+
+  TaskDep( const policy_type & arg_p , const int arg_i )
+    : policy( arg_p ), input( arg_i ) {}
+
+  void apply( int & val )
+  {
+    val = input ;
+    const int num = policy.get_dependence( this );
+
+    for ( int i = 0 ; i < num ; ++i ) {
+      Kokkos::Experimental::Future<int,Space> f = policy.get_dependence( this , i );
+      val += f.get();
+    }
+  }
+};
+
+
+template< class Space >
+void test_task_dep( const int n )
+{
+  enum { NTEST = 64 };
+
+  Kokkos::Experimental::TaskPolicy< Space > policy ;
+
+  Kokkos::Experimental::Future<int,Space> f[ NTEST ];
+
+  for ( int i = 0 ; i < NTEST ; ++i ) {
+    // Create task in the "constructing" state with capacity for 'n+1' dependences
+    f[i] = policy.create( TaskDep<Space>(policy,0) , n + 1 );
+
+    if ( f[i].get_task_state() != Kokkos::Experimental::TASK_STATE_CONSTRUCTING ) {
+      Kokkos::Impl::throw_runtime_exception("get_task_state() != Kokkos::Experimental::TASK_STATE_CONSTRUCTING");
+    }
+
+    // Only use 'n' dependences
+
+    for ( int j = 0 ; j < n ; ++j ) {
+
+      Kokkos::Experimental::Future<int,Space> nested = policy.create( TaskDep<Space>(policy,j+1) );
+
+      policy.spawn( nested );
+
+      // Add dependence to a "constructing" task
+      policy.add_dependence( f[i] , nested );
+    }
+
+    // Spawn task from the "constructing" to the "waiting" state
+    policy.spawn( f[i] );
+  }
+
+  const int answer = n % 2 ? n * ( ( n + 1 ) / 2 ) : ( n / 2 ) * ( n + 1 );
+
+  Kokkos::Experimental::wait( policy );
+
+  int error = 0 ;
+  for ( int i = 0 ; i < NTEST ; ++i ) {
+    if ( f[i].get_task_state() != Kokkos::Experimental::TASK_STATE_COMPLETE ) {
+      Kokkos::Impl::throw_runtime_exception("get_task_state() != Kokkos::Experimental::TASK_STATE_COMPLETE");
+    }
+    if ( answer != f[i].get() && 0 == error ) {
+      std::cout << "test_task_dep(" << n << ") ERROR at[" << i << "]"
+                << " answer(" << answer << ") != result(" << f[i].get() << ")" << std::endl ;
+    }
+  }
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_HAVE_CXX11 )
+
+template< class ExecSpace >
+struct TaskTeam {
+
+  enum { SPAN = 8 };
+
+  typedef void value_type ;
+  typedef Kokkos::Experimental::TaskPolicy<ExecSpace>  policy_type ;
+  typedef Kokkos::Experimental::Future<ExecSpace>      future_type ;
+  typedef Kokkos::View<long*,ExecSpace>                view_type ;
+
+  policy_type  policy ;
+  future_type  future ;
+
+  view_type  result ;
+  const long nvalue ;
+
+  TaskTeam( const policy_type & arg_policy
+          , const view_type   & arg_result
+          , const long          arg_nvalue )
+    : policy(arg_policy)
+    , future()
+    , result( arg_result )
+    , nvalue( arg_nvalue )
+    {}
+
+  inline
+  void apply( const typename policy_type::member_type & member )
+    {
+      const long end   = nvalue + 1 ;
+      const long begin = 0 < end - SPAN ? end - SPAN : 0 ;
+
+      if ( 0 < begin && future.get_task_state() == Kokkos::Experimental::TASK_STATE_NULL ) {
+        if ( member.team_rank() == 0 ) {
+          future = policy.spawn( policy.create_team( TaskTeam( policy , result , begin - 1 ) ) );
+          policy.clear_dependence( this );
+          policy.add_dependence( this , future );
+          policy.respawn( this );
+        }
+        return ;
+      }
+
+      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i ) { result[i] = i + 1 ; }
+                          );
+    }
+};
+
+template< class ExecSpace >
+struct TaskTeamValue {
+
+  enum { SPAN = 8 };
+
+  typedef long value_type ;
+  typedef Kokkos::Experimental::TaskPolicy<ExecSpace>         policy_type ;
+  typedef Kokkos::Experimental::Future<value_type,ExecSpace>  future_type ;
+  typedef Kokkos::View<long*,ExecSpace>                       view_type ;
+
+  policy_type  policy ;
+  future_type  future ;
+
+  view_type  result ;
+  const long nvalue ;
+
+  TaskTeamValue( const policy_type & arg_policy
+               , const view_type   & arg_result
+               , const long          arg_nvalue )
+    : policy(arg_policy)
+    , future()
+    , result( arg_result )
+    , nvalue( arg_nvalue )
+    {}
+
+  inline
+  void apply( const typename policy_type::member_type & member , value_type & final )
+    {
+      const long end   = nvalue + 1 ;
+      const long begin = 0 < end - SPAN ? end - SPAN : 0 ;
+
+      if ( 0 < begin && future.get_task_state() == Kokkos::Experimental::TASK_STATE_NULL ) {
+        if ( member.team_rank() == 0 ) {
+          future = policy.spawn( policy.create_team( TaskTeamValue( policy , result , begin - 1 ) ) );
+          policy.clear_dependence( this );
+          policy.add_dependence( this , future );
+          policy.respawn( this );
+        }
+        return ;
+      }
+
+      Kokkos::parallel_for( Kokkos::TeamThreadRange(member,begin,end)
+                          , [&]( int i ) { result[i] = i + 1 ; }
+                          );
+
+      if ( member.team_rank() == 0 ) {
+        final = result[nvalue] ;
+      }
+
+      Kokkos::memory_fence();
+    }
+};
+
+template< class ExecSpace >
+void test_task_team( long n )
+{
+  typedef TaskTeam< ExecSpace >            task_type ;
+  typedef TaskTeamValue< ExecSpace >       task_value_type ;
+  typedef typename task_type::view_type    view_type ;
+  typedef typename task_type::policy_type  policy_type ;
+
+  typedef typename task_type::future_type        future_type ;
+  typedef typename task_value_type::future_type  future_value_type ;
+
+  policy_type  policy ;
+  view_type    result("result",n+1);
+
+  future_type f = policy.spawn( policy.create_team( task_type( policy , result , n ) ) );
+
+  Kokkos::Experimental::wait( policy );
+
+  for ( long i = 0 ; i <= n ; ++i ) {
+    const long answer = i + 1 ;
+    if ( result(i) != answer ) {
+      std::cerr << "test_task_team void ERROR result(" << i << ") = " << result(i) << " != " << answer << std::endl ;
+    }
+  }
+
+  future_value_type fv = policy.spawn( policy.create_team( task_value_type( policy , result , n ) ) );
+
+  Kokkos::Experimental::wait( policy );
+
+  if ( fv.get() != n + 1 ) {
+    std::cerr << "test_task_team value ERROR future = " << fv.get() << " != " << n + 1 << std::endl ;
+  }
+  for ( long i = 0 ; i <= n ; ++i ) {
+    const long answer = i + 1 ;
+    if ( result(i) != answer ) {
+      std::cerr << "test_task_team value ERROR result(" << i << ") = " << result(i) << " != " << answer << std::endl ;
+    }
+  }
+}
+
+#endif
+
+//----------------------------------------------------------------------------
+
+} // namespace TestTaskPolicy
+
+#endif /* #ifndef KOKKOS_UNITTEST_TASKPOLICY_HPP */
+
+
diff --git a/lib/kokkos/core/unit_test/TestTeam.hpp b/lib/kokkos/core/unit_test/TestTeam.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..4849f18dfbac209252d5d2ddde8e0d8dfc98ac7d
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestTeam.hpp
@@ -0,0 +1,466 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <stdio.h>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+namespace {
+
+template< class ExecSpace >
+struct TestTeamPolicy {
+
+  typedef typename Kokkos::TeamPolicy< ExecSpace >::member_type team_member ;
+  typedef Kokkos::View<int**,ExecSpace> view_type ;
+
+  view_type m_flags ;
+
+  TestTeamPolicy( const size_t league_size )
+    : m_flags( Kokkos::ViewAllocateWithoutInitializing("flags")
+             , Kokkos::TeamPolicy< ExecSpace >::team_size_max( *this )
+             , league_size )
+    {}
+
+  struct VerifyInitTag {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const team_member & member ) const
+    {
+      const int tid = member.team_rank() + member.team_size() * member.league_rank();
+
+      m_flags( member.team_rank() , member.league_rank() ) = tid ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const VerifyInitTag & , const team_member & member ) const
+    {
+      const int tid = member.team_rank() + member.team_size() * member.league_rank();
+
+      if ( tid != m_flags( member.team_rank() , member.league_rank() ) ) {
+        printf("TestTeamPolicy member(%d,%d) error %d != %d\n"
+              , member.league_rank() , member.team_rank()
+              , tid , m_flags( member.team_rank() , member.league_rank() ) );
+      }
+    }
+
+  static void test_for( const size_t league_size )
+    {
+      TestTeamPolicy functor( league_size );
+
+      const int team_size = Kokkos::TeamPolicy< ExecSpace >::team_size_max( functor );
+
+      Kokkos::parallel_for( Kokkos::TeamPolicy< ExecSpace >( league_size , team_size ) , functor );
+      Kokkos::parallel_for( Kokkos::TeamPolicy< ExecSpace , VerifyInitTag >( league_size , team_size ) , functor );
+    }
+
+  struct ReduceTag {};
+
+  typedef long value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const team_member & member , value_type & update ) const
+    {
+      update += member.team_rank() + member.team_size() * member.league_rank();
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const ReduceTag & , const team_member & member , value_type & update ) const
+    {
+      update += 1 + member.team_rank() + member.team_size() * member.league_rank();
+    }
+
+  static void test_reduce( const size_t league_size )
+    {
+      TestTeamPolicy functor( league_size );
+
+      const int team_size = Kokkos::TeamPolicy< ExecSpace >::team_size_max( functor );
+      const long N = team_size * league_size ;
+
+      long total = 0 ;
+
+      Kokkos::parallel_reduce( Kokkos::TeamPolicy< ExecSpace >( league_size , team_size ) , functor , total );
+      ASSERT_EQ( size_t((N-1)*(N))/2 , size_t(total) );
+
+      Kokkos::parallel_reduce( Kokkos::TeamPolicy< ExecSpace , ReduceTag >( league_size , team_size ) , functor , total );
+      ASSERT_EQ( (size_t(N)*size_t(N+1))/2 , size_t(total) );
+    }
+};
+
+}
+}
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+template< typename ScalarType , class DeviceType >
+class ReduceTeamFunctor
+{
+public:
+  typedef DeviceType execution_space ;
+  typedef Kokkos::TeamPolicy< execution_space >  policy_type ;
+  typedef typename execution_space::size_type        size_type ;
+
+  struct value_type {
+    ScalarType value[3] ;
+  };
+
+  const size_type nwork ;
+
+  ReduceTeamFunctor( const size_type & arg_nwork ) : nwork( arg_nwork ) {}
+
+  ReduceTeamFunctor( const ReduceTeamFunctor & rhs )
+    : nwork( rhs.nwork ) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & dst ) const
+  {
+    dst.value[0] = 0 ;
+    dst.value[1] = 0 ;
+    dst.value[2] = 0 ;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( volatile value_type & dst ,
+             const volatile value_type & src ) const
+  {
+    dst.value[0] += src.value[0] ;
+    dst.value[1] += src.value[1] ;
+    dst.value[2] += src.value[2] ;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const typename policy_type::member_type ind , value_type & dst ) const
+  {
+    const int thread_rank = ind.team_rank() + ind.team_size() * ind.league_rank();
+    const int thread_size = ind.team_size() * ind.league_size();
+    const int chunk = ( nwork + thread_size - 1 ) / thread_size ;
+
+    size_type iwork = chunk * thread_rank ;
+    const size_type iwork_end = iwork + chunk < nwork ? iwork + chunk : nwork ;
+
+    for ( ; iwork < iwork_end ; ++iwork ) {
+      dst.value[0] += 1 ;
+      dst.value[1] += iwork + 1 ;
+      dst.value[2] += nwork - iwork ;
+    }
+  }
+};
+
+} // namespace Test
+
+namespace {
+
+template< typename ScalarType , class DeviceType >
+class TestReduceTeam
+{
+public:
+  typedef DeviceType    execution_space ;
+  typedef Kokkos::TeamPolicy< execution_space >  policy_type ;
+  typedef typename execution_space::size_type    size_type ;
+
+  //------------------------------------
+
+  TestReduceTeam( const size_type & nwork )
+  {
+    run_test(nwork);
+  }
+
+  void run_test( const size_type & nwork )
+  {
+    typedef Test::ReduceTeamFunctor< ScalarType , execution_space > functor_type ;
+    typedef typename functor_type::value_type value_type ;
+    typedef Kokkos::View< value_type, Kokkos::HostSpace, Kokkos::MemoryUnmanaged > result_type ;
+
+    enum { Count = 3 };
+    enum { Repeat = 100 };
+
+    value_type result[ Repeat ];
+
+    const unsigned long nw   = nwork ;
+    const unsigned long nsum = nw % 2 ? nw * (( nw + 1 )/2 )
+                                      : (nw/2) * ( nw + 1 );
+
+    const unsigned team_size   = policy_type::team_size_recommended( functor_type(nwork) );
+    const unsigned league_size = ( nwork + team_size - 1 ) / team_size ;
+
+    policy_type team_exec( league_size , team_size );
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      result_type tmp( & result[i] );
+      Kokkos::parallel_reduce( team_exec , functor_type(nwork) , tmp );
+    }
+
+    execution_space::fence();
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      for ( unsigned j = 0 ; j < Count ; ++j ) {
+        const unsigned long correct = 0 == j % 3 ? nw : nsum ;
+        ASSERT_EQ( (ScalarType) correct , result[i].value[j] );
+      }
+    }
+  }
+};
+
+}
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+template< class DeviceType >
+class ScanTeamFunctor
+{
+public:
+  typedef DeviceType  execution_space ;
+  typedef Kokkos::TeamPolicy< execution_space >  policy_type ;
+
+  typedef long int    value_type ;
+  Kokkos::View< value_type , execution_space > accum ;
+  Kokkos::View< value_type , execution_space > total ;
+
+  ScanTeamFunctor() : accum("accum"), total("total") {}
+
+  KOKKOS_INLINE_FUNCTION
+  void init( value_type & error ) const { error = 0 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void join( value_type volatile & error ,
+             value_type volatile const & input ) const
+    { if ( input ) error = 1 ; }
+
+  struct JoinMax {
+    typedef long int value_type ;
+    KOKKOS_INLINE_FUNCTION
+    void join( value_type volatile & dst
+             , value_type volatile const & input ) const
+      { if ( dst < input ) dst = input ; }
+  };
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const typename policy_type::member_type ind , value_type & error ) const
+  {
+    if ( 0 == ind.league_rank() && 0 == ind.team_rank() ) {
+      const long int thread_count = ind.league_size() * ind.team_size();
+      total() = ( thread_count * ( thread_count + 1 ) ) / 2 ;
+    }
+
+    // Team max:
+    const int long m = ind.team_reduce( (long int) ( ind.league_rank() + ind.team_rank() ) , JoinMax() );
+
+    if ( m != ind.league_rank() + ( ind.team_size() - 1 ) ) {
+      printf("ScanTeamFunctor[%d.%d of %d.%d] reduce_max_answer(%ld) != reduce_max(%ld)\n"
+            , ind.league_rank(), ind.team_rank()
+            , ind.league_size(), ind.team_size()
+            , (long int)(ind.league_rank() + ( ind.team_size() - 1 )) , m );
+    }
+
+    // Scan:
+    const long int answer =
+      ( ind.league_rank() + 1 ) * ind.team_rank() +
+      ( ind.team_rank() * ( ind.team_rank() + 1 ) ) / 2 ;
+
+    const long int result =
+      ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 );
+
+    const long int result2 =
+      ind.team_scan( ind.league_rank() + 1 + ind.team_rank() + 1 );
+
+    if ( answer != result || answer != result2 ) {
+      printf("ScanTeamFunctor[%d.%d of %d.%d] answer(%ld) != scan_first(%ld) or scan_second(%ld)\n",
+             ind.league_rank(), ind.team_rank(),
+             ind.league_size(), ind.team_size(),
+             answer,result,result2);
+      error = 1 ;
+    }
+
+    const long int thread_rank = ind.team_rank() +
+                                 ind.team_size() * ind.league_rank();
+    ind.team_scan( 1 + thread_rank , accum.ptr_on_device() );
+  }
+};
+
+template< class DeviceType >
+class TestScanTeam
+{
+public:
+  typedef DeviceType  execution_space ;
+  typedef long int    value_type ;
+
+  typedef Kokkos::TeamPolicy< execution_space > policy_type ;
+  typedef Test::ScanTeamFunctor<DeviceType> functor_type ;
+
+  //------------------------------------
+
+  TestScanTeam( const size_t nteam )
+  {
+    run_test(nteam);
+  }
+
+  void run_test( const size_t nteam )
+  {
+    typedef Kokkos::View< long int , Kokkos::HostSpace , Kokkos::MemoryUnmanaged >  result_type ;
+
+    const unsigned REPEAT = 100000 ;
+    const unsigned Repeat = ( REPEAT + nteam - 1 ) / nteam ;
+
+    functor_type functor ;
+
+    policy_type team_exec( nteam , policy_type::team_size_max( functor ) );
+
+    for ( unsigned i = 0 ; i < Repeat ; ++i ) {
+      long int accum = 0 ;
+      long int total = 0 ;
+      long int error = 0 ;
+      Kokkos::deep_copy( functor.accum , total );
+      Kokkos::parallel_reduce( team_exec , functor , result_type( & error ) );
+      DeviceType::fence();
+      Kokkos::deep_copy( accum , functor.accum );
+      Kokkos::deep_copy( total , functor.total );
+
+      ASSERT_EQ( error , 0 );
+      ASSERT_EQ( total , accum );
+    }
+
+    execution_space::fence();
+  }
+};
+
+} // namespace Test
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+template< class ExecSpace >
+struct SharedTeamFunctor {
+
+  typedef ExecSpace  execution_space ;
+  typedef int        value_type ;
+  typedef Kokkos::TeamPolicy< execution_space >  policy_type ;
+
+  enum { SHARED_COUNT = 1000 };
+
+  typedef typename ExecSpace::scratch_memory_space shmem_space ;
+
+  // tbd: MemoryUnmanaged should be the default for shared memory space
+  typedef Kokkos::View<int*,shmem_space,Kokkos::MemoryUnmanaged> shared_int_array_type ;
+
+  // Tell how much shared memory will be required by this functor:
+  inline
+  unsigned team_shmem_size( int /* team_size */ ) const
+  {
+    return shared_int_array_type::shmem_size( SHARED_COUNT ) +
+           shared_int_array_type::shmem_size( SHARED_COUNT );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const typename policy_type::member_type & ind , value_type & update ) const
+  {
+    const shared_int_array_type shared_A( ind.team_shmem() , SHARED_COUNT );
+    const shared_int_array_type shared_B( ind.team_shmem() , SHARED_COUNT );
+
+    if ((shared_A.ptr_on_device () == NULL && SHARED_COUNT > 0) ||
+        (shared_B.ptr_on_device () == NULL && SHARED_COUNT > 0)) {
+      printf ("Failed to allocate shared memory of size %lu\n",
+              static_cast<unsigned long> (SHARED_COUNT));
+      ++update; // failure to allocate is an error
+    }
+    else {
+      for ( int i = ind.team_rank() ; i < SHARED_COUNT ; i += ind.team_size() ) {
+        shared_A[i] = i + ind.league_rank();
+        shared_B[i] = 2 * i + ind.league_rank();
+      }
+
+      ind.team_barrier();
+
+      if ( ind.team_rank() + 1 == ind.team_size() ) {
+        for ( int i = 0 ; i < SHARED_COUNT ; ++i ) {
+          if ( shared_A[i] != i + ind.league_rank() ) {
+            ++update ;
+          }
+          if ( shared_B[i] != 2 * i + ind.league_rank() ) {
+            ++update ;
+          }
+        }
+      }
+    }
+  }
+};
+
+}
+
+namespace {
+
+template< class ExecSpace >
+struct TestSharedTeam {
+
+  TestSharedTeam()
+  { run(); }
+
+  void run()
+  {
+    typedef Test::SharedTeamFunctor<ExecSpace> Functor ;
+    typedef Kokkos::View< typename Functor::value_type , Kokkos::HostSpace , Kokkos::MemoryUnmanaged >  result_type ;
+
+    const size_t team_size = Kokkos::TeamPolicy< ExecSpace >::team_size_max( Functor() );
+
+    Kokkos::TeamPolicy< ExecSpace > team_exec( 8192 / team_size , team_size );
+
+    typename Functor::value_type error_count = 0 ;
+
+    Kokkos::parallel_reduce( team_exec , Functor() , result_type( & error_count ) );
+
+    ASSERT_EQ( error_count , 0 );
+  }
+};
+
+}
+
+/*--------------------------------------------------------------------------*/
diff --git a/lib/kokkos/core/unit_test/TestTeamVector.hpp b/lib/kokkos/core/unit_test/TestTeamVector.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..add8b7ed4578a40b964f688f3ef02d93fb1a1cc5
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestTeamVector.hpp
@@ -0,0 +1,650 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#include <impl/Kokkos_Timer.hpp>
+#include <iostream>
+#include <cstdlib>
+
+namespace TestTeamVector {
+
+struct my_complex {
+  double re,im;
+  int dummy;
+  KOKKOS_INLINE_FUNCTION
+  my_complex() {
+    re = 0.0;
+    im = 0.0;
+    dummy = 0;
+  }
+  KOKKOS_INLINE_FUNCTION
+  my_complex(const my_complex& src) {
+    re = src.re;
+    im = src.im;
+    dummy = src.dummy;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  my_complex(const volatile my_complex& src) {
+    re = src.re;
+    im = src.im;
+    dummy = src.dummy;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  my_complex(const double& val) {
+    re = val;
+    im = 0.0;
+    dummy = 0;
+  }
+  KOKKOS_INLINE_FUNCTION
+  my_complex& operator += (const my_complex& src) {
+    re += src.re;
+    im += src.im;
+    dummy += src.dummy;
+    return *this;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator += (const volatile my_complex& src) volatile {
+    re += src.re;
+    im += src.im;
+    dummy += src.dummy;
+  }
+  KOKKOS_INLINE_FUNCTION
+  my_complex& operator *= (const my_complex& src) {
+    double re_tmp = re*src.re - im*src.im;
+    double im_tmp = re * src.im + im * src.re;
+    re = re_tmp;
+    im = im_tmp;
+    dummy *= src.dummy;
+    return *this;
+  }
+  KOKKOS_INLINE_FUNCTION
+  void operator *= (const volatile my_complex& src) volatile {
+    double re_tmp = re*src.re - im*src.im;
+    double im_tmp = re * src.im + im * src.re;
+    re = re_tmp;
+    im = im_tmp;
+    dummy *= src.dummy;
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator == (const my_complex& src) {
+    return (re == src.re) && (im == src.im) && ( dummy == src.dummy );
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator != (const my_complex& src) {
+      return (re != src.re) || (im != src.im) || ( dummy != src.dummy );
+  }
+  KOKKOS_INLINE_FUNCTION
+  bool operator != (const double& val) {
+    return (re != val) ||
+           (im != 0) || (dummy != 0);
+  }
+  KOKKOS_INLINE_FUNCTION
+  my_complex& operator= (const int& val) {
+    re = val;
+    im = 0.0;
+    dummy = 0;
+    return *this;
+  }
+  KOKKOS_INLINE_FUNCTION
+  my_complex& operator= (const double& val) {
+    re = val;
+    im = 0.0;
+    dummy = 0;
+    return *this;
+  }
+  KOKKOS_INLINE_FUNCTION
+  operator double() {
+    return re;
+  }
+};
+
+#if defined (KOKKOS_HAVE_CXX11)
+
+
+template<typename Scalar, class ExecutionSpace>
+struct functor_team_for {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_team_for(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+
+  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+
+    typedef typename ExecutionSpace::scratch_memory_space shmem_space ;
+    typedef Kokkos::View<Scalar*,shmem_space,Kokkos::MemoryUnmanaged> shared_int;
+    typedef typename shared_int::size_type size_type;
+
+    const size_type shmemSize = team.team_size () * 13;
+    shared_int values = shared_int (team.team_shmem (), shmemSize);
+
+    if (values.ptr_on_device () == NULL || values.dimension_0 () < shmemSize) {
+      printf ("FAILED to allocate shared memory of size %u\n",
+              static_cast<unsigned int> (shmemSize));
+    }
+    else {
+
+      // Initialize shared memory
+      values(team.team_rank ()) = 0;
+
+      // Accumulate value into per thread shared memory
+      // This is non blocking
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(team,131),[&] (int i) {
+        values(team.team_rank ()) += i - team.league_rank () + team.league_size () + team.team_size ();
+      });
+      // Wait for all memory to be written
+      team.team_barrier ();
+      // One thread per team executes the comparison
+      Kokkos::single(Kokkos::PerTeam(team),[&]() {
+            Scalar test = 0;
+            Scalar value = 0;
+            for (int i = 0; i < 131; ++i) {
+              test += i - team.league_rank () + team.league_size () + team.team_size ();
+            }
+            for (int i = 0; i < team.team_size (); ++i) {
+              value += values(i);
+            }
+            if (test != value) {
+              printf ("FAILED team_parallel_for %i %i %f %f\n",
+                      team.league_rank (), team.team_rank (),
+                      static_cast<double> (test), static_cast<double> (value));
+              flag() = 1;
+            }
+      });
+    }
+  }
+};
+
+template<typename Scalar, class ExecutionSpace>
+struct functor_team_reduce {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_team_reduce(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+
+  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+
+    Scalar value = Scalar();
+    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131),[&] (int i, Scalar& val) {
+      val += i - team.league_rank () + team.league_size () + team.team_size ();
+    },value);
+
+    team.team_barrier ();
+    Kokkos::single(Kokkos::PerTeam(team),[&]() {
+         Scalar test = 0;
+         for (int i = 0; i < 131; ++i) {
+           test += i - team.league_rank () + team.league_size () + team.team_size ();
+         }
+         if (test != value) {
+           if(team.league_rank() == 0)
+           printf ("FAILED team_parallel_reduce %i %i %f %f %lu\n",
+             team.league_rank (), team.team_rank (),
+             static_cast<double> (test), static_cast<double> (value),sizeof(Scalar));
+              flag() = 1;
+         }
+    });
+  }
+};
+
+template<typename Scalar, class ExecutionSpace>
+struct functor_team_reduce_join {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_team_reduce_join(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+
+  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+
+    Scalar value = 0;
+
+    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131)
+      , [&] (int i, Scalar& val) {
+        val += i - team.league_rank () + team.league_size () + team.team_size ();
+      }
+      , [&] (volatile Scalar& val, const volatile Scalar& src) {val+=src;}
+      , value
+    );
+
+    team.team_barrier ();
+    Kokkos::single(Kokkos::PerTeam(team),[&]() {
+         Scalar test = 0;
+         for (int i = 0; i < 131; ++i) {
+           test += i - team.league_rank () + team.league_size () + team.team_size ();
+         }
+         if (test != value) {
+           printf ("FAILED team_vector_parallel_reduce_join %i %i %f %f\n",
+             team.league_rank (), team.team_rank (),
+             static_cast<double> (test), static_cast<double> (value));
+              flag() = 1;
+         }
+    });
+  }
+};
+
+template<typename Scalar, class ExecutionSpace>
+struct functor_team_vector_for {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_team_vector_for(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+
+  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+
+    typedef typename ExecutionSpace::scratch_memory_space shmem_space ;
+    typedef Kokkos::View<Scalar*,shmem_space,Kokkos::MemoryUnmanaged> shared_int;
+    typedef typename shared_int::size_type size_type;
+
+    const size_type shmemSize = team.team_size () * 13;
+    shared_int values = shared_int (team.team_shmem (), shmemSize);
+
+    if (values.ptr_on_device () == NULL || values.dimension_0 () < shmemSize) {
+      printf ("FAILED to allocate shared memory of size %u\n",
+              static_cast<unsigned int> (shmemSize));
+    }
+    else {
+      Kokkos::single(Kokkos::PerThread(team),[&] () {
+        values(team.team_rank ()) = 0;
+      });
+
+      Kokkos::parallel_for(Kokkos::TeamThreadRange(team,131),[&] (int i) {
+        Kokkos::single(Kokkos::PerThread(team),[&] () {
+          values(team.team_rank ()) += i - team.league_rank () + team.league_size () + team.team_size ();
+        });
+      });
+
+      team.team_barrier ();
+      Kokkos::single(Kokkos::PerTeam(team),[&]() {
+        Scalar test = 0;
+        Scalar value = 0;
+        for (int i = 0; i < 131; ++i) {
+          test += i - team.league_rank () + team.league_size () + team.team_size ();
+        }
+        for (int i = 0; i < team.team_size (); ++i) {
+          value += values(i);
+        }
+        if (test != value) {
+          printf ("FAILED team_vector_parallel_for %i %i %f %f\n",
+                  team.league_rank (), team.team_rank (),
+                  static_cast<double> (test), static_cast<double> (value));
+          flag() = 1;
+        }
+      });
+    }
+  }
+};
+
+template<typename Scalar, class ExecutionSpace>
+struct functor_team_vector_reduce {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_team_vector_reduce(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+
+  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+
+    Scalar value = Scalar();
+    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131),[&] (int i, Scalar& val) {
+        val += i - team.league_rank () + team.league_size () + team.team_size ();
+    },value);
+
+    team.team_barrier ();
+    Kokkos::single(Kokkos::PerTeam(team),[&]() {
+      Scalar test = 0;
+      for (int i = 0; i < 131; ++i) {
+        test += i - team.league_rank () + team.league_size () + team.team_size ();
+      }
+      if (test != value) {
+        if(team.league_rank() == 0)
+        printf ("FAILED team_vector_parallel_reduce %i %i %f %f %lu\n",
+          team.league_rank (), team.team_rank (),
+          static_cast<double> (test), static_cast<double> (value),sizeof(Scalar));
+           flag() = 1;
+      }
+    });
+  }
+};
+
+template<typename Scalar, class ExecutionSpace>
+struct functor_team_vector_reduce_join {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_team_vector_reduce_join(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+
+  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+
+    Scalar value = 0;
+    Kokkos::parallel_reduce(Kokkos::TeamThreadRange(team,131)
+      , [&] (int i, Scalar& val) {
+        val += i - team.league_rank () + team.league_size () + team.team_size ();
+      }
+      , [&] (volatile Scalar& val, const volatile Scalar& src) {val+=src;}
+      , value
+    );
+
+    team.team_barrier ();
+    Kokkos::single(Kokkos::PerTeam(team),[&]() {
+      Scalar test = 0;
+      for (int i = 0; i < 131; ++i) {
+         test += i - team.league_rank () + team.league_size () + team.team_size ();
+      }
+      if (test != value) {
+        printf ("FAILED team_vector_parallel_reduce_join %i %i %f %f\n",
+          team.league_rank (), team.team_rank (),
+          static_cast<double> (test), static_cast<double> (value));
+        flag() = 1;
+      }
+    });
+  }
+};
+
+template<typename Scalar, class ExecutionSpace>
+struct functor_vec_single {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_vec_single(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+
+    // Warning: this test case intentionally violates permissable semantics
+    // It is not valid to get references to members of the enclosing region
+    // inside a parallel_for and write to it.
+    Scalar value = 0;
+
+    Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,13),[&] (int i) {
+      value = i; // This write is violating Kokkos semantics for nested parallelism
+    });
+
+    Kokkos::single(Kokkos::PerThread(team),[&] (Scalar& val) {
+      val = 1;
+    },value);
+
+    Scalar value2 = 0;
+    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,13), [&] (int i, Scalar& val) {
+      val += value;
+    },value2);
+
+    if(value2!=(value*13)) {
+      printf("FAILED vector_single broadcast %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) value2,(double) value);
+      flag()=1;
+    }
+  }
+};
+
+template<typename Scalar, class ExecutionSpace>
+struct functor_vec_for {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_vec_for(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+
+  unsigned team_shmem_size(int team_size) const {return team_size*13*sizeof(Scalar)+8;}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+
+    typedef typename ExecutionSpace::scratch_memory_space shmem_space ;
+    typedef Kokkos::View<Scalar*,shmem_space,Kokkos::MemoryUnmanaged> shared_int;
+    shared_int values = shared_int(team.team_shmem(),team.team_size()*13);
+
+    if (values.ptr_on_device () == NULL ||
+        values.dimension_0() < (unsigned) team.team_size() * 13) {
+      printf ("FAILED to allocate memory of size %i\n",
+              static_cast<int> (team.team_size () * 13));
+      flag() = 1;
+    }
+    else {
+      Kokkos::parallel_for(Kokkos::ThreadVectorRange(team,13), [&] (int i) {
+        values(13*team.team_rank() + i) = i - team.team_rank() - team.league_rank() + team.league_size() + team.team_size();
+      });
+
+      Kokkos::single(Kokkos::PerThread(team),[&] () {
+        Scalar test = 0;
+        Scalar value = 0;
+        for (int i = 0; i < 13; ++i) {
+          test += i - team.team_rank() - team.league_rank() + team.league_size() + team.team_size();
+          value += values(13*team.team_rank() + i);
+        }
+        if (test != value) {
+          printf ("FAILED vector_par_for %i %i %f %f\n",
+                  team.league_rank (), team.team_rank (),
+                  static_cast<double> (test), static_cast<double> (value));
+          flag() = 1;
+        }
+      });
+    }
+  }
+};
+
+template<typename Scalar, class ExecutionSpace>
+struct functor_vec_red {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_vec_red(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+    Scalar value = 0;
+
+    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,13),[&] (int i, Scalar& val) {
+      val += i;
+    }, value);
+
+    Kokkos::single(Kokkos::PerThread(team),[&] () {
+      Scalar test = 0;
+      for(int i = 0; i < 13; i++) {
+        test+=i;
+      }
+      if(test!=value) {
+        printf("FAILED vector_par_reduce %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) test,(double) value);
+        flag()=1;
+      }
+    });
+  }
+};
+
+template<typename Scalar, class ExecutionSpace>
+struct functor_vec_red_join {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_vec_red_join(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+    Scalar value = 1;
+
+    Kokkos::parallel_reduce(Kokkos::ThreadVectorRange(team,13)
+      , [&] (int i, Scalar& val) { val *= i; }
+      , [&] (Scalar& val, const Scalar& src) {val*=src;}
+      , value
+    );
+
+    Kokkos::single(Kokkos::PerThread(team),[&] () {
+      Scalar test = 1;
+      for(int i = 0; i < 13; i++) {
+        test*=i;
+      }
+      if(test!=value) {
+        printf("FAILED vector_par_reduce_join %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) test,(double) value);
+        flag()=1;
+      }
+    });
+  }
+};
+
+template<typename Scalar, class ExecutionSpace>
+struct functor_vec_scan {
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_vec_scan(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team) const {
+    Kokkos::parallel_scan(Kokkos::ThreadVectorRange(team,13),[&] (int i, Scalar& val, bool final) {
+      val += i;
+      if(final) {
+        Scalar test = 0;
+        for(int k = 0; k <= i; k++) {
+          test+=k;
+        }
+        if(test!=val) {
+          printf("FAILED vector_par_scan %i %i %f %f\n",team.league_rank(),team.team_rank(),(double) test,(double) val);
+          flag()=1;
+        }
+      }
+    });
+  }
+};
+
+template<typename Scalar, class ExecutionSpace>
+struct functor_reduce {
+  typedef double value_type;
+  typedef Kokkos::TeamPolicy<ExecutionSpace> policy_type;
+  typedef ExecutionSpace execution_space;
+
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag;
+  functor_reduce(Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> flag_):flag(flag_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (typename policy_type::member_type team, double& sum) const {
+    sum += team.league_rank() * 100 + team.thread_rank();
+  }
+};
+#endif
+
+template<typename Scalar,class ExecutionSpace>
+bool test_scalar(int nteams, int team_size, int test) {
+  Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace> d_flag("flag");
+  typename Kokkos::View<int,Kokkos::LayoutLeft,ExecutionSpace>::HostMirror h_flag("h_flag");
+  h_flag() = 0 ;
+  Kokkos::deep_copy(d_flag,h_flag);
+  #ifdef KOKKOS_HAVE_CXX11
+  if(test==0)
+  Kokkos::parallel_for( std::string("A") , Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
+      functor_vec_red<Scalar, ExecutionSpace>(d_flag));
+  if(test==1)
+  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
+      functor_vec_red_join<Scalar, ExecutionSpace>(d_flag));
+  if(test==2)
+  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
+      functor_vec_scan<Scalar, ExecutionSpace>(d_flag));
+  if(test==3)
+  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
+      functor_vec_for<Scalar, ExecutionSpace>(d_flag));
+  if(test==4)
+  Kokkos::parallel_for( "B" , Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
+      functor_vec_single<Scalar, ExecutionSpace>(d_flag));
+  if(test==5)
+  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size),
+      functor_team_for<Scalar, ExecutionSpace>(d_flag));
+  if(test==6)
+  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size),
+      functor_team_reduce<Scalar, ExecutionSpace>(d_flag));
+  if(test==7)
+  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size),
+      functor_team_reduce_join<Scalar, ExecutionSpace>(d_flag));
+  if(test==8)
+  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
+      functor_team_vector_for<Scalar, ExecutionSpace>(d_flag));
+  if(test==9)
+  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
+      functor_team_vector_reduce<Scalar, ExecutionSpace>(d_flag));
+  if(test==10)
+  Kokkos::parallel_for( Kokkos::TeamPolicy<ExecutionSpace>(nteams,team_size,8),
+      functor_team_vector_reduce_join<Scalar, ExecutionSpace>(d_flag));
+  #endif
+  Kokkos::deep_copy(h_flag,d_flag);
+
+  return (h_flag() == 0);
+}
+
+template<class ExecutionSpace>
+bool Test(int test) {
+  bool passed = true;
+  passed = passed && test_scalar<int, ExecutionSpace>(317,33,test);
+  passed = passed && test_scalar<long long int, ExecutionSpace>(317,33,test);
+  passed = passed && test_scalar<float, ExecutionSpace>(317,33,test);
+  passed = passed && test_scalar<double, ExecutionSpace>(317,33,test);
+  passed = passed && test_scalar<my_complex, ExecutionSpace>(317,33,test);
+  return passed;
+}
+
+}
+
diff --git a/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp b/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..4f136bc64b977e3243b9aaf789d4837e7e5ca793
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestTemplateMetaFunctions.hpp
@@ -0,0 +1,219 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <Kokkos_Core.hpp>
+
+#define KOKKOS_PRAGMA_UNROLL(a)
+
+namespace {
+
+template<class Scalar, class ExecutionSpace>
+struct SumPlain {
+  typedef ExecutionSpace execution_space;
+  typedef typename Kokkos::View<Scalar*,execution_space> type;
+  type view;
+  SumPlain(type view_):view(view_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, Scalar& val) {
+    val += Scalar();
+  }
+};
+
+template<class Scalar, class ExecutionSpace>
+struct SumInitJoinFinalValueType {
+  typedef ExecutionSpace execution_space;
+  typedef typename Kokkos::View<Scalar*,execution_space> type;
+  type view;
+  typedef Scalar value_type;
+  SumInitJoinFinalValueType(type view_):view(view_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type& val) const {
+    val = value_type();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& val, volatile value_type& src) const {
+    val += src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, value_type& val) const {
+    val += value_type();
+  }
+
+};
+
+template<class Scalar, class ExecutionSpace>
+struct SumInitJoinFinalValueType2 {
+  typedef ExecutionSpace execution_space;
+  typedef typename Kokkos::View<Scalar*,execution_space> type;
+  type view;
+  typedef Scalar value_type;
+  SumInitJoinFinalValueType2(type view_):view(view_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void init(volatile value_type& val) const {
+    val = value_type();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& val, const volatile value_type& src) const {
+    val += src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, value_type& val) const {
+    val += value_type();
+  }
+
+};
+
+template<class Scalar, class ExecutionSpace>
+struct SumInitJoinFinalValueTypeArray {
+  typedef ExecutionSpace execution_space;
+  typedef typename Kokkos::View<Scalar*,execution_space> type;
+  type view;
+  typedef Scalar value_type[];
+  int n;
+  SumInitJoinFinalValueTypeArray(type view_, int n_):view(view_),n(n_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void init(value_type val) const {
+    for(int k=0;k<n;k++)
+      val[k] = 0;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type val, const volatile value_type src) const {
+    for(int k=0;k<n;k++)
+      val[k] += src[k];
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, value_type val) const {
+    for(int k=0;k<n;k++)
+      val[k] += k*i;
+  }
+
+};
+
+template<class Scalar, class ExecutionSpace>
+struct SumWrongInitJoinFinalValueType {
+  typedef ExecutionSpace execution_space;
+  typedef typename Kokkos::View<Scalar*,execution_space> type;
+  type view;
+  typedef Scalar value_type;
+  SumWrongInitJoinFinalValueType(type view_):view(view_) {}
+
+  KOKKOS_INLINE_FUNCTION
+  void init(double& val) const {
+    val = double();
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void join(volatile value_type& val, const value_type& src) const {
+    val += src;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator() (int i, value_type& val) const {
+    val += value_type();
+  }
+
+};
+
+template<class Scalar, class ExecutionSpace>
+void TestTemplateMetaFunctions() {
+  typedef typename Kokkos::View<Scalar*,ExecutionSpace> type;
+  type a("A",100);
+/*  #ifdef KOKKOS_HAVE_CXX11
+  int sum_plain_has_init_arg = Kokkos::Impl::FunctorHasInit<SumPlain<Scalar,ExecutionSpace>, Scalar& >::value;
+  ASSERT_EQ(sum_plain_has_init_arg,0);
+  int sum_initjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ(sum_initjoinfinalvaluetype_has_init_arg,1);
+  int sum_initjoinfinalvaluetype_has_init_arg2 = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ(sum_initjoinfinalvaluetype_has_init_arg2,1);
+  int sum_wronginitjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<SumWrongInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ(sum_wronginitjoinfinalvaluetype_has_init_arg,0);
+
+  //int sum_initjoinfinalvaluetypearray_has_init_arg = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueTypeArray<Scalar,ExecutionSpace>, Scalar[] >::value;
+  //ASSERT_EQ(sum_initjoinfinalvaluetypearray_has_init_arg,1);
+
+  #else
+
+  int sum_plain_has_init_arg = Kokkos::Impl::FunctorHasInit<SumPlain<Scalar,ExecutionSpace>, Scalar& >::value;
+  ASSERT_EQ(sum_plain_has_init_arg,0);
+  int sum_initjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<SumInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar& >::value;
+  ASSERT_EQ(sum_initjoinfinalvaluetype_has_init_arg,1);
+  int sum_wronginitjoinfinalvaluetype_has_init_arg = Kokkos::Impl::FunctorHasInit<SumWrongInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar& >::value;
+  ASSERT_EQ(sum_wronginitjoinfinalvaluetype_has_init_arg,1);
+
+  #endif
+
+  //printf("Values Init: %i %i %i\n",sum_plain_has_init_arg,sum_initjoinfinalvaluetype_has_init_arg,sum_wronginitjoinfinalvaluetype_has_init_arg);
+
+#ifdef KOKKOS_HAVE_CXX11
+  int sum_plain_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumPlain<Scalar,ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ(sum_plain_has_join_arg,0);
+  int sum_initjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ(sum_initjoinfinalvaluetype_has_join_arg,1);
+  int sum_initjoinfinalvaluetype_has_join_arg2 = Kokkos::Impl::FunctorHasJoin<SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ(sum_initjoinfinalvaluetype_has_join_arg2,1);
+  int sum_wronginitjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumWrongInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar >::value;
+  ASSERT_EQ(sum_wronginitjoinfinalvaluetype_has_join_arg,0);
+#else
+  int sum_plain_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumPlain<Scalar,ExecutionSpace>, Scalar& >::value;
+  ASSERT_EQ(sum_plain_has_join_arg,0);
+  int sum_initjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar& >::value;
+  ASSERT_EQ(sum_initjoinfinalvaluetype_has_join_arg,1);
+  int sum_initjoinfinalvaluetype_has_join_arg2 = Kokkos::Impl::FunctorHasJoin<SumInitJoinFinalValueType2<Scalar,ExecutionSpace>, Scalar& >::value;
+  ASSERT_EQ(sum_initjoinfinalvaluetype_has_join_arg2,1);
+  int sum_wronginitjoinfinalvaluetype_has_join_arg = Kokkos::Impl::FunctorHasJoin<SumWrongInitJoinFinalValueType<Scalar,ExecutionSpace>, Scalar& >::value;
+  ASSERT_EQ(sum_wronginitjoinfinalvaluetype_has_join_arg,1);
+#endif*/
+  //printf("Values Join: %i %i %i\n",sum_plain_has_join_arg,sum_initjoinfinalvaluetype_has_join_arg,sum_wronginitjoinfinalvaluetype_has_join_arg);
+}
+
+}
diff --git a/lib/kokkos/core/unit_test/TestThreads.cpp b/lib/kokkos/core/unit_test/TestThreads.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..3832998ab5f04fdf91020691539872a48733b8fd
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestThreads.cpp
@@ -0,0 +1,443 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Macros.hpp>
+
+#if defined( KOKKOS_HAVE_PTHREAD )
+
+#include <Kokkos_Core.hpp>
+
+#include <Threads/Kokkos_Threads_TaskPolicy.hpp>
+
+//----------------------------------------------------------------------------
+
+#include <TestSharedAlloc.hpp>
+#include <TestViewMapping.hpp>
+
+#include <TestViewImpl.hpp>
+
+#include <TestViewAPI.hpp>
+#include <TestViewSubview.hpp>
+#include <TestAtomic.hpp>
+
+#include <TestReduce.hpp>
+#include <TestScan.hpp>
+#include <TestRange.hpp>
+#include <TestTeam.hpp>
+#include <TestAggregate.hpp>
+#include <TestAggregateReduction.hpp>
+#include <TestCompilerMacros.hpp>
+#include <TestCXX11.hpp>
+#include <TestCXX11Deduction.hpp>
+#include <TestTeamVector.hpp>
+#include <TestMemorySpaceTracking.hpp>
+#include <TestTemplateMetaFunctions.hpp>
+
+#include <TestTaskPolicy.hpp>
+
+namespace Test {
+
+class threads : public ::testing::Test {
+protected:
+  static void SetUpTestCase()
+  {
+    // Finalize without initialize is a no-op:
+    Kokkos::Threads::finalize();
+
+    const unsigned numa_count       = Kokkos::hwloc::get_available_numa_count();
+    const unsigned cores_per_numa   = Kokkos::hwloc::get_available_cores_per_numa();
+    const unsigned threads_per_core = Kokkos::hwloc::get_available_threads_per_core();
+
+    unsigned threads_count = 0 ;
+
+    // Initialize and finalize with no threads:
+    Kokkos::Threads::initialize( 1u );
+    Kokkos::Threads::finalize();
+
+    threads_count = std::max( 1u , numa_count )
+                  * std::max( 2u , cores_per_numa * threads_per_core );
+
+    Kokkos::Threads::initialize( threads_count );
+    Kokkos::Threads::finalize();
+
+    
+    threads_count = std::max( 1u , numa_count * 2 )
+                  * std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 );
+
+    Kokkos::Threads::initialize( threads_count );
+    Kokkos::Threads::finalize();
+
+    // Quick attempt to verify thread start/terminate don't have race condition:
+    threads_count = std::max( 1u , numa_count )
+                  * std::max( 2u , ( cores_per_numa * threads_per_core ) / 2 );
+    for ( unsigned i = 0 ; i < 10 ; ++i ) {
+      Kokkos::Threads::initialize( threads_count );
+      Kokkos::Threads::sleep();
+      Kokkos::Threads::wake();
+      Kokkos::Threads::finalize();
+    }
+
+    Kokkos::Threads::initialize( threads_count );
+    Kokkos::Threads::print_configuration( std::cout , true /* detailed */ );
+  }
+
+  static void TearDownTestCase()
+  {
+    Kokkos::Threads::finalize();
+  }
+};
+
+TEST_F( threads , init ) {
+  ;
+}
+
+TEST_F( threads , impl_shared_alloc ) {
+  test_shared_alloc< Kokkos::HostSpace , Kokkos::Threads >();
+}
+
+TEST_F( threads , impl_view_mapping ) {
+  test_view_mapping< Kokkos::Threads >();
+  test_view_mapping_subview< Kokkos::Threads >();
+  test_view_mapping_operator< Kokkos::Threads >();
+  TestViewMappingAtomic< Kokkos::Threads >::run();
+}
+
+
+TEST_F( threads, view_impl) {
+  test_view_impl< Kokkos::Threads >();
+}
+
+TEST_F( threads, view_api) {
+  TestViewAPI< double , Kokkos::Threads >();
+}
+
+TEST_F( threads, view_subview_auto_1d_left ) {
+  TestViewSubview::test_auto_1d< Kokkos::LayoutLeft,Kokkos::Threads >();
+}
+
+TEST_F( threads, view_subview_auto_1d_right ) {
+  TestViewSubview::test_auto_1d< Kokkos::LayoutRight,Kokkos::Threads >();
+}
+
+TEST_F( threads, view_subview_auto_1d_stride ) {
+  TestViewSubview::test_auto_1d< Kokkos::LayoutStride,Kokkos::Threads >();
+}
+
+TEST_F( threads, view_subview_assign_strided ) {
+  TestViewSubview::test_1d_strided_assignment< Kokkos::Threads >();
+}
+
+TEST_F( threads, view_subview_left_0 ) {
+  TestViewSubview::test_left_0< Kokkos::Threads >();
+}
+
+TEST_F( threads, view_subview_left_1 ) {
+  TestViewSubview::test_left_1< Kokkos::Threads >();
+}
+
+TEST_F( threads, view_subview_left_2 ) {
+  TestViewSubview::test_left_2< Kokkos::Threads >();
+}
+
+TEST_F( threads, view_subview_left_3 ) {
+  TestViewSubview::test_left_3< Kokkos::Threads >();
+}
+
+TEST_F( threads, view_subview_right_0 ) {
+  TestViewSubview::test_right_0< Kokkos::Threads >();
+}
+
+TEST_F( threads, view_subview_right_1 ) {
+  TestViewSubview::test_right_1< Kokkos::Threads >();
+}
+
+TEST_F( threads, view_subview_right_3 ) {
+  TestViewSubview::test_right_3< Kokkos::Threads >();
+}
+
+
+TEST_F( threads, view_aggregate ) {
+  TestViewAggregate< Kokkos::Threads >();
+  TestViewAggregateReduction< Kokkos::Threads >();
+}
+
+TEST_F( threads , range_tag )
+{
+  TestRange< Kokkos::Threads >::test_for(1000);
+  TestRange< Kokkos::Threads >::test_reduce(1000);
+  TestRange< Kokkos::Threads >::test_scan(1000);
+}
+
+TEST_F( threads , team_tag )
+{
+  TestTeamPolicy< Kokkos::Threads >::test_for(1000);
+  TestTeamPolicy< Kokkos::Threads >::test_reduce(1000);
+}
+
+TEST_F( threads, long_reduce) {
+  TestReduce< long ,   Kokkos::Threads >( 1000000 );
+}
+
+TEST_F( threads, double_reduce) {
+  TestReduce< double ,   Kokkos::Threads >( 1000000 );
+}
+
+TEST_F( threads, team_long_reduce) {
+  TestReduceTeam< long ,   Kokkos::Threads >( 100000 );
+}
+
+TEST_F( threads, team_double_reduce) {
+  TestReduceTeam< double ,   Kokkos::Threads >( 100000 );
+}
+
+TEST_F( threads, long_reduce_dynamic ) {
+  TestReduceDynamic< long ,   Kokkos::Threads >( 1000000 );
+}
+
+TEST_F( threads, double_reduce_dynamic ) {
+  TestReduceDynamic< double ,   Kokkos::Threads >( 1000000 );
+}
+
+TEST_F( threads, long_reduce_dynamic_view ) {
+  TestReduceDynamicView< long ,   Kokkos::Threads >( 1000000 );
+}
+
+TEST_F( threads, team_shared_request) {
+  TestSharedTeam< Kokkos::Threads >();
+}
+
+TEST_F( threads , view_remap )
+{
+  enum { N0 = 3 , N1 = 2 , N2 = 8 , N3 = 9 };
+
+  typedef Kokkos::View< double*[N1][N2][N3] ,
+                             Kokkos::LayoutRight ,
+                             Kokkos::Threads > output_type ;
+
+  typedef Kokkos::View< int**[N2][N3] ,
+                             Kokkos::LayoutLeft ,
+                             Kokkos::Threads > input_type ;
+
+  typedef Kokkos::View< int*[N0][N2][N3] ,
+                             Kokkos::LayoutLeft ,
+                             Kokkos::Threads > diff_type ;
+
+  output_type output( "output" , N0 );
+  input_type  input ( "input" , N0 , N1 );
+  diff_type   diff  ( "diff" , N0 );
+
+  int value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+    input(i0,i1,i2,i3) = ++value ;
+  }}}}
+
+  // Kokkos::deep_copy( diff , input ); // throw with incompatible shape
+  Kokkos::deep_copy( output , input );
+
+  value = 0 ;
+  for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+  for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+  for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+  for ( size_t i0 = 0 ; i0 < N0 ; ++i0 ) {
+    ++value ;
+    ASSERT_EQ( value , ((int) output(i0,i1,i2,i3) ) );
+  }}}}
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( threads , atomics )
+{
+  const int loop_count = 1e6 ;
+
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Threads>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Threads>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<int,Kokkos::Threads>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Threads>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Threads>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned int,Kokkos::Threads>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Threads>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Threads>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long int,Kokkos::Threads>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Threads>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Threads>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<unsigned long int,Kokkos::Threads>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Threads>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Threads>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<long long int,Kokkos::Threads>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Threads>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Threads>(loop_count,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<double,Kokkos::Threads>(loop_count,3) ) );
+
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Threads>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Threads>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<float,Kokkos::Threads>(100,3) ) );
+
+#if defined( KOKKOS_ENABLE_ASM )
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Threads>(100,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Threads>(100,2) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<Kokkos::complex<double> ,Kokkos::Threads>(100,3) ) );
+#endif
+
+  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<3>, Kokkos::Threads>(loop_count,1) ) );
+  ASSERT_TRUE( ( TestAtomic::Loop<TestAtomic::SuperScalar<3>, Kokkos::Threads>(loop_count,2) ) );
+}
+
+//----------------------------------------------------------------------------
+
+#if 0
+TEST_F( threads , scan_small )
+{
+  typedef TestScan< Kokkos::Threads , Kokkos::Impl::ThreadsExecUseScanSmall > TestScanFunctor ;
+  for ( int i = 0 ; i < 1000 ; ++i ) {
+    TestScanFunctor( 10 );
+    TestScanFunctor( 10000 );
+  }
+  TestScanFunctor( 1000000 );
+  TestScanFunctor( 10000000 );
+
+  Kokkos::Threads::fence();
+}
+#endif
+
+TEST_F( threads , scan )
+{
+  TestScan< Kokkos::Threads >::test_range( 1 , 1000 );
+  TestScan< Kokkos::Threads >( 1000000 );
+  TestScan< Kokkos::Threads >( 10000000 );
+  Kokkos::Threads::fence();
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( threads , team_scan )
+{
+  TestScanTeam< Kokkos::Threads >( 10 );
+  TestScanTeam< Kokkos::Threads >( 10000 );
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( threads , compiler_macros )
+{
+  ASSERT_TRUE( ( TestCompilerMacros::Test< Kokkos::Threads >() ) );
+}
+
+TEST_F( threads , memory_space )
+{
+  TestMemorySpace< Kokkos::Threads >();
+}
+
+//----------------------------------------------------------------------------
+
+TEST_F( threads , template_meta_functions )
+{
+  TestTemplateMetaFunctions<int, Kokkos::Threads >();
+}
+
+//----------------------------------------------------------------------------
+
+#if defined( KOKKOS_HAVE_CXX11 ) && defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
+TEST_F( threads , cxx11 )
+{
+  if ( Kokkos::Impl::is_same< Kokkos::DefaultExecutionSpace , Kokkos::Threads >::value ) {
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(1) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(2) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(3) ) );
+    ASSERT_TRUE( ( TestCXX11::Test< Kokkos::Threads >(4) ) );
+  }
+}
+#endif
+
+#if defined (KOKKOS_HAVE_CXX11)
+
+TEST_F( threads , reduction_deduction )
+{
+  TestCXX11::test_reduction_deduction< Kokkos::Threads >();
+}
+
+TEST_F( threads , team_vector )
+{
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(0) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(1) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(2) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(3) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(4) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(5) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(6) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(7) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(8) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(9) ) );
+  ASSERT_TRUE( ( TestTeamVector::Test< Kokkos::Threads >(10) ) );
+}
+
+#endif
+
+TEST_F( threads , task_policy )
+{
+  TestTaskPolicy::test_task_dep< Kokkos::Threads >( 10 );
+  for ( long i = 0 ; i < 25 ; ++i ) TestTaskPolicy::test_fib< Kokkos::Threads >(i);
+  for ( long i = 0 ; i < 35 ; ++i ) TestTaskPolicy::test_fib2< Kokkos::Threads >(i);
+}
+
+#if defined( KOKKOS_HAVE_CXX11 )
+TEST_F( threads , task_team )
+{
+  TestTaskPolicy::test_task_team< Kokkos::Threads >(1000);
+}
+#endif
+
+
+} // namespace Test
+
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
diff --git a/lib/kokkos/core/unit_test/TestTile.hpp b/lib/kokkos/core/unit_test/TestTile.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..dfb2bd81b3dec3485688f9827d3f1f7ad24ddb9d
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestTile.hpp
@@ -0,0 +1,153 @@
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+
+#ifndef TEST_TILE_HPP
+#define TEST_TILE_HPP
+
+#include <Kokkos_Core.hpp>
+
+namespace TestTile {
+
+template < typename Device , typename TileLayout>
+struct ReduceTileErrors
+{
+  typedef Device execution_space ;
+
+  typedef Kokkos::View< ptrdiff_t**, TileLayout, Device>  array_type;
+  typedef Kokkos::View< ptrdiff_t[ TileLayout::N0 ][ TileLayout::N1 ], Kokkos::LayoutLeft , Device >  tile_type ;
+
+  array_type m_array ;
+
+  typedef ptrdiff_t value_type;
+
+  ReduceTileErrors( array_type a )
+    : m_array(a)
+  {}
+
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & errors )
+  {
+    errors = 0;
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & errors ,
+                    const volatile value_type & src_errors )
+  {
+    errors += src_errors;
+  }
+
+  // Initialize
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_t iwork ) const
+  {
+    const size_t i = iwork % m_array.dimension_0();
+    const size_t j = iwork / m_array.dimension_0();
+    if ( j < m_array.dimension_1() ) {
+      m_array(i,j) = & m_array(i,j) - & m_array(0,0);
+
+// printf("m_array(%d,%d) = %d\n",int(i),int(j),int(m_array(i,j)));
+
+    }
+  }
+
+  // Verify:
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_t iwork , value_type & errors ) const
+  {
+    const size_t tile_dim0 = ( m_array.dimension_0() + TileLayout::N0 - 1 ) / TileLayout::N0 ;
+    const size_t tile_dim1 = ( m_array.dimension_1() + TileLayout::N1 - 1 ) / TileLayout::N1 ;
+
+    const size_t itile = iwork % tile_dim0 ;
+    const size_t jtile = iwork / tile_dim0 ;
+
+    if ( jtile < tile_dim1 ) {
+
+      tile_type tile = Kokkos::tile_subview( m_array , itile , jtile );
+
+      if ( tile(0,0) != ptrdiff_t(( itile + jtile * tile_dim0 ) * TileLayout::N0 * TileLayout::N1 ) ) {
+        ++errors ;
+      }
+      else {
+
+        for ( size_t j = 0 ; j < size_t(TileLayout::N1) ; ++j ) {
+        for ( size_t i = 0 ; i < size_t(TileLayout::N0) ; ++i ) {
+          const size_t iglobal = i + itile * TileLayout::N0 ;
+          const size_t jglobal = j + jtile * TileLayout::N1 ;
+
+          if ( iglobal < m_array.dimension_0() && jglobal < m_array.dimension_1() ) {
+            if ( tile(i,j) != ptrdiff_t( tile(0,0) + i + j * TileLayout::N0 ) ) ++errors ;
+
+// printf("tile(%d,%d)(%d,%d) = %d\n",int(itile),int(jtile),int(i),int(j),int(tile(i,j)));
+
+          }
+        }
+        }
+      }
+    }
+  }
+};
+
+template< class Space , unsigned N0 , unsigned N1 >
+void test( const size_t dim0 , const size_t dim1 )
+{
+  typedef Kokkos::LayoutTileLeft<N0,N1>  array_layout ;
+  typedef ReduceTileErrors< Space , array_layout > functor_type ;
+
+  const size_t tile_dim0 = ( dim0 + N0 - 1 ) / N0 ;
+  const size_t tile_dim1 = ( dim1 + N1 - 1 ) / N1 ;
+  
+  typename functor_type::array_type array("",dim0,dim1);
+
+  Kokkos::parallel_for( Kokkos::RangePolicy<Space,size_t>(0,dim0*dim1) , functor_type( array ) );
+
+  ptrdiff_t error = 0 ;
+
+  Kokkos::parallel_reduce( Kokkos::RangePolicy<Space,size_t>(0,tile_dim0*tile_dim1) , functor_type( array ) , error );
+
+  EXPECT_EQ( error , ptrdiff_t(0) );
+}
+
+} /* namespace TestTile */
+
+#endif //TEST_TILE_HPP
+
diff --git a/lib/kokkos/core/unit_test/TestViewAPI.hpp b/lib/kokkos/core/unit_test/TestViewAPI.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..b0a81cec6beefc38233685e506e514c1595dc4ef
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestViewAPI.hpp
@@ -0,0 +1,1305 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+/*--------------------------------------------------------------------------*/
+
+#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
+
+namespace Test {
+
+template< typename T, class DeviceType >
+class TestViewAPI {
+public:
+  TestViewAPI() {}
+};
+
+}
+
+#else
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+template< class T , class L , class D , class M , class S >
+size_t allocation_count( const Kokkos::View<T,L,D,M,S> & view )
+{
+  const size_t card  = Kokkos::Impl::cardinality_count( view.shape() );
+  const size_t alloc = view.capacity();
+
+  return card <= alloc ? alloc : 0 ;
+}
+
+/*--------------------------------------------------------------------------*/
+
+template< typename T, class DeviceType>
+struct TestViewOperator
+{
+  typedef DeviceType  execution_space ;
+
+  static const unsigned N = 100 ;
+  static const unsigned D = 3 ;
+
+  typedef Kokkos::View< T*[D] , execution_space > view_type ;
+
+  const view_type v1 ;
+  const view_type v2 ;
+
+  TestViewOperator()
+    : v1( "v1" , N )
+    , v2( "v2" , N )
+    {}
+
+  static void testit()
+  {
+    Kokkos::parallel_for( N , TestViewOperator() );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const unsigned i ) const
+  {
+    const unsigned X = 0 ;
+    const unsigned Y = 1 ;
+    const unsigned Z = 2 ;
+
+    v2(i,X) = v1(i,X);
+    v2(i,Y) = v1(i,Y);
+    v2(i,Z) = v1(i,Z);
+  }
+};
+
+/*--------------------------------------------------------------------------*/
+
+template< class DataType >
+struct rank {
+private:
+  typedef typename Kokkos::Impl::AnalyzeShape<DataType>::shape shape ;
+public:
+  static const unsigned value = shape::rank ;
+};
+
+template< class DataType ,
+          class DeviceType ,
+          unsigned Rank = rank< DataType >::value >
+struct TestViewOperator_LeftAndRight ;
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 8 >
+{
+  typedef DeviceType                          execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
+
+  typedef typename left_view ::shape_type  left_shape ;
+  typedef typename right_view::shape_type  right_shape ;
+
+  left_shape   lsh ;
+  right_shape  rsh ;
+  left_view    left ;
+  right_view   right ;
+  stride_view  left_stride ;
+  stride_view  right_stride ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight()
+    : lsh()
+    , rsh()
+    , left(  "left" )
+    , right( "right" )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  static void testit()
+  {
+    TestViewOperator_LeftAndRight driver ;
+
+    ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.lsh ) <= driver.left_alloc );
+    ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.rsh ) <= driver.right_alloc );
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+
+    offset = -1 ;
+    for ( unsigned i7 = 0 ; i7 < unsigned(lsh.N7) ; ++i7 )
+    for ( unsigned i6 = 0 ; i6 < unsigned(lsh.N6) ; ++i6 )
+    for ( unsigned i5 = 0 ; i5 < unsigned(lsh.N5) ; ++i5 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(lsh.N4) ; ++i4 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(lsh.N3) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(lsh.N2) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3, i4, i5, i6, i7 ) -
+                     & left(  0,  0,  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+
+      if ( & left(i0,i1,i2,i3,i4,i5,i6,i7) !=
+           & left_stride(i0,i1,i2,i3,i4,i5,i6,i7) ) {
+        update |= 4 ;
+      }
+    }
+
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(rsh.N0) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(rsh.N1) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(rsh.N2) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(rsh.N3) ; ++i3 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(rsh.N4) ; ++i4 )
+    for ( unsigned i5 = 0 ; i5 < unsigned(rsh.N5) ; ++i5 )
+    for ( unsigned i6 = 0 ; i6 < unsigned(rsh.N6) ; ++i6 )
+    for ( unsigned i7 = 0 ; i7 < unsigned(rsh.N7) ; ++i7 )
+    {
+      const long j = & right( i0, i1, i2, i3, i4, i5, i6, i7 ) -
+                     & right(  0,  0,  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+
+      if ( & right(i0,i1,i2,i3,i4,i5,i6,i7) !=
+           & right_stride(i0,i1,i2,i3,i4,i5,i6,i7) ) {
+        update |= 8 ;
+      }
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 7 >
+{
+  typedef DeviceType                          execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  typedef typename left_view ::shape_type  left_shape ;
+  typedef typename right_view::shape_type  right_shape ;
+
+  left_shape   lsh ;
+  right_shape  rsh ;
+  left_view    left ;
+  right_view   right ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight()
+    : lsh()
+    , rsh()
+    , left(  "left" )
+    , right( "right" )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  static void testit()
+  {
+    TestViewOperator_LeftAndRight driver ;
+
+    ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.lsh ) <= driver.left_alloc );
+    ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.rsh ) <= driver.right_alloc );
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+
+    offset = -1 ;
+    for ( unsigned i6 = 0 ; i6 < unsigned(lsh.N6) ; ++i6 )
+    for ( unsigned i5 = 0 ; i5 < unsigned(lsh.N5) ; ++i5 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(lsh.N4) ; ++i4 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(lsh.N3) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(lsh.N2) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3, i4, i5, i6 ) -
+                     & left(  0,  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+    }
+
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(rsh.N0) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(rsh.N1) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(rsh.N2) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(rsh.N3) ; ++i3 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(rsh.N4) ; ++i4 )
+    for ( unsigned i5 = 0 ; i5 < unsigned(rsh.N5) ; ++i5 )
+    for ( unsigned i6 = 0 ; i6 < unsigned(rsh.N6) ; ++i6 )
+    {
+      const long j = & right( i0, i1, i2, i3, i4, i5, i6 ) -
+                     & right(  0,  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 6 >
+{
+  typedef DeviceType                          execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  typedef typename left_view ::shape_type  left_shape ;
+  typedef typename right_view::shape_type  right_shape ;
+
+  left_shape   lsh ;
+  right_shape  rsh ;
+  left_view    left ;
+  right_view   right ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight()
+    : lsh()
+    , rsh()
+    , left(  "left" )
+    , right( "right" )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  static void testit()
+  {
+    TestViewOperator_LeftAndRight driver ;
+
+    ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.lsh ) <= driver.left_alloc );
+    ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.rsh ) <= driver.right_alloc );
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+
+    offset = -1 ;
+    for ( unsigned i5 = 0 ; i5 < unsigned(lsh.N5) ; ++i5 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(lsh.N4) ; ++i4 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(lsh.N3) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(lsh.N2) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3, i4, i5 ) -
+                     & left(  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+    }
+
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(rsh.N0) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(rsh.N1) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(rsh.N2) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(rsh.N3) ; ++i3 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(rsh.N4) ; ++i4 )
+    for ( unsigned i5 = 0 ; i5 < unsigned(rsh.N5) ; ++i5 )
+    {
+      const long j = & right( i0, i1, i2, i3, i4, i5 ) -
+                     & right(  0,  0,  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 5 >
+{
+  typedef DeviceType                          execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
+
+  typedef typename left_view ::shape_type  left_shape ;
+  typedef typename right_view::shape_type  right_shape ;
+
+  left_shape   lsh ;
+  right_shape  rsh ;
+  left_view    left ;
+  right_view   right ;
+  stride_view  left_stride ;
+  stride_view  right_stride ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight()
+    : lsh()
+    , rsh()
+    , left(  "left" )
+    , right( "right" )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  static void testit()
+  {
+    TestViewOperator_LeftAndRight driver ;
+
+    ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.lsh ) <= driver.left_alloc );
+    ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.rsh ) <= driver.right_alloc );
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+
+    offset = -1 ;
+    for ( unsigned i4 = 0 ; i4 < unsigned(lsh.N4) ; ++i4 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(lsh.N3) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(lsh.N2) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3, i4 ) -
+                     & left(  0,  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+
+      if ( & left( i0, i1, i2, i3, i4 ) !=
+           & left_stride( i0, i1, i2, i3, i4 ) ) { update |= 4 ; }
+    }
+
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(rsh.N0) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(rsh.N1) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(rsh.N2) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(rsh.N3) ; ++i3 )
+    for ( unsigned i4 = 0 ; i4 < unsigned(rsh.N4) ; ++i4 )
+    {
+      const long j = & right( i0, i1, i2, i3, i4 ) -
+                     & right(  0,  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+
+      if ( & right( i0, i1, i2, i3, i4 ) !=
+           & right_stride( i0, i1, i2, i3, i4 ) ) { update |= 8 ; }
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 4 >
+{
+  typedef DeviceType                          execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  typedef typename left_view ::shape_type  left_shape ;
+  typedef typename right_view::shape_type  right_shape ;
+
+  left_shape   lsh ;
+  right_shape  rsh ;
+  left_view    left ;
+  right_view   right ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight()
+    : lsh()
+    , rsh()
+    , left(  "left" )
+    , right( "right" )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  static void testit()
+  {
+    TestViewOperator_LeftAndRight driver ;
+
+    ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.lsh ) <= driver.left_alloc );
+    ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.rsh ) <= driver.right_alloc );
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+
+    offset = -1 ;
+    for ( unsigned i3 = 0 ; i3 < unsigned(lsh.N3) ; ++i3 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(lsh.N2) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 )
+    {
+      const long j = & left( i0, i1, i2, i3 ) -
+                     & left(  0,  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+    }
+
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(rsh.N0) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(rsh.N1) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(rsh.N2) ; ++i2 )
+    for ( unsigned i3 = 0 ; i3 < unsigned(rsh.N3) ; ++i3 )
+    {
+      const long j = & right( i0, i1, i2, i3 ) -
+                     & right(  0,  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 3 >
+{
+  typedef DeviceType                          execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
+
+  typedef typename left_view ::shape_type  left_shape ;
+  typedef typename right_view::shape_type  right_shape ;
+
+  left_shape   lsh ;
+  right_shape  rsh ;
+  left_view    left ;
+  right_view   right ;
+  stride_view  left_stride ;
+  stride_view  right_stride ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight()
+    : lsh()
+    , rsh()
+    , left(  std::string("left") )
+    , right( std::string("right") )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  static void testit()
+  {
+    TestViewOperator_LeftAndRight driver ;
+
+    ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.lsh ) <= driver.left_alloc );
+    ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.rsh ) <= driver.right_alloc );
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+
+    offset = -1 ;
+    for ( unsigned i2 = 0 ; i2 < unsigned(lsh.N2) ; ++i2 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 )
+    {
+      const long j = & left( i0, i1, i2 ) -
+                     & left(  0,  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+
+      if ( & left(i0,i1,i2) != & left_stride(i0,i1,i2) ) { update |= 4 ; }
+    }
+
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(rsh.N0) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(rsh.N1) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(rsh.N2) ; ++i2 )
+    {
+      const long j = & right( i0, i1, i2 ) -
+                     & right(  0,  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+
+      if ( & right(i0,i1,i2) != & right_stride(i0,i1,i2) ) { update |= 8 ; }
+    }
+
+    for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 )
+    for ( unsigned i2 = 0 ; i2 < unsigned(lsh.N2) ; ++i2 )
+    {
+      if ( & left(i0,i1,i2)  != & left.at(i0,i1,i2,0,0,0,0,0) )  { update |= 3 ; }
+      if ( & right(i0,i1,i2) != & right.at(i0,i1,i2,0,0,0,0,0) ) { update |= 3 ; }
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 2 >
+{
+  typedef DeviceType                          execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  typedef typename left_view ::shape_type  left_shape ;
+  typedef typename right_view::shape_type  right_shape ;
+
+  left_shape   lsh ;
+  right_shape  rsh ;
+  left_view    left ;
+  right_view   right ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight()
+    : lsh()
+    , rsh()
+    , left(  Kokkos::ViewAllocate("left") )
+    , right( Kokkos::ViewAllocate("right") )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  static void testit()
+  {
+    TestViewOperator_LeftAndRight driver ;
+
+    ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.lsh ) <= driver.left_alloc );
+    ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.rsh ) <= driver.right_alloc );
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    long offset ;
+
+    offset = -1 ;
+    for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 )
+    for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 )
+    {
+      const long j = & left( i0, i1 ) -
+                     & left(  0,  0 );
+      if ( j <= offset || left_alloc <= j ) { update |= 1 ; }
+      offset = j ;
+    }
+
+    offset = -1 ;
+    for ( unsigned i0 = 0 ; i0 < unsigned(rsh.N0) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(rsh.N1) ; ++i1 )
+    {
+      const long j = & right( i0, i1 ) -
+                     & right(  0,  0 );
+      if ( j <= offset || right_alloc <= j ) { update |= 2 ; }
+      offset = j ;
+    }
+
+    for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 )
+    for ( unsigned i1 = 0 ; i1 < unsigned(lsh.N1) ; ++i1 )
+    {
+      if ( & left(i0,i1)  != & left.at(i0,i1,0,0,0,0,0,0) )  { update |= 3 ; }
+      if ( & right(i0,i1) != & right.at(i0,i1,0,0,0,0,0,0) ) { update |= 3 ; }
+    }
+  }
+};
+
+template< class DataType , class DeviceType >
+struct TestViewOperator_LeftAndRight< DataType , DeviceType , 1 >
+{
+  typedef DeviceType                          execution_space ;
+  typedef typename execution_space::memory_space  memory_space ;
+  typedef typename execution_space::size_type     size_type ;
+
+  typedef int value_type ;
+
+  KOKKOS_INLINE_FUNCTION
+  static void join( volatile value_type & update ,
+                    const volatile value_type & input )
+    { update |= input ; }
+
+  KOKKOS_INLINE_FUNCTION
+  static void init( value_type & update )
+    { update = 0 ; }
+
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutLeft, execution_space > left_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutRight, execution_space > right_view ;
+
+  typedef Kokkos::
+    View< DataType, Kokkos::LayoutStride, execution_space > stride_view ;
+
+  typedef typename left_view ::shape_type  left_shape ;
+  typedef typename right_view::shape_type  right_shape ;
+
+  left_shape   lsh ;
+  right_shape  rsh ;
+  left_view    left ;
+  right_view   right ;
+  stride_view  left_stride ;
+  stride_view  right_stride ;
+  long         left_alloc ;
+  long         right_alloc ;
+
+  TestViewOperator_LeftAndRight()
+    : lsh()
+    , rsh()
+    , left(  Kokkos::ViewAllocate() )
+    , right( Kokkos::ViewAllocate() )
+    , left_stride( left )
+    , right_stride( right )
+    , left_alloc( allocation_count( left ) )
+    , right_alloc( allocation_count( right ) )
+    {}
+
+  static void testit()
+  {
+    TestViewOperator_LeftAndRight driver ;
+
+    ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.lsh ) <= driver.left_alloc );
+    ASSERT_TRUE( (long) Kokkos::Impl::cardinality_count( driver.rsh ) <= driver.right_alloc );
+
+    int error_flag = 0 ;
+
+    Kokkos::parallel_reduce( 1 , driver , error_flag );
+
+    ASSERT_EQ( error_flag , 0 );
+  }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const size_type , value_type & update ) const
+  {
+    for ( unsigned i0 = 0 ; i0 < unsigned(lsh.N0) ; ++i0 )
+    {
+      if ( & left(i0)  != & left.at(i0,0,0,0,0,0,0,0) )  { update |= 3 ; }
+      if ( & right(i0) != & right.at(i0,0,0,0,0,0,0,0) ) { update |= 3 ; }
+      if ( & left(i0)  != & left_stride(i0) ) { update |= 4 ; }
+      if ( & right(i0) != & right_stride(i0) ) { update |= 8 ; }
+    }
+  }
+};
+
+/*--------------------------------------------------------------------------*/
+
+template< typename T, class DeviceType >
+class TestViewAPI
+{
+public:
+  typedef DeviceType        device ;
+
+  enum { N0 = 1000 ,
+         N1 = 3 ,
+         N2 = 5 ,
+         N3 = 7 };
+
+  typedef Kokkos::View< T , device > dView0 ;
+  typedef Kokkos::View< T* , device > dView1 ;
+  typedef Kokkos::View< T*[N1] , device > dView2 ;
+  typedef Kokkos::View< T*[N1][N2] , device > dView3 ;
+  typedef Kokkos::View< T*[N1][N2][N3] , device > dView4 ;
+  typedef Kokkos::View< const T*[N1][N2][N3] , device > const_dView4 ;
+
+  typedef Kokkos::View< T****, device, Kokkos::MemoryUnmanaged > dView4_unmanaged ;
+
+  typedef typename dView0::host_mirror_space host ;
+
+  TestViewAPI()
+  {
+    run_test_mirror();
+    run_test();
+    run_test_scalar();
+    run_test_const();
+    run_test_subview();
+    run_test_subview_strided();
+    run_test_vector();
+
+    TestViewOperator< T , device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4][2][3] , device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4][2] , device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3][4] , device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2][3] , device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4][2] , device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3][4] , device >::testit();
+    TestViewOperator_LeftAndRight< int[2][3] , device >::testit();
+    TestViewOperator_LeftAndRight< int[2] , device >::testit();
+  }
+
+  static void run_test_mirror()
+  {
+    typedef Kokkos::View< int , host > view_type ;
+    typedef typename view_type::HostMirror mirror_type ;
+    view_type a("a");
+    mirror_type am = Kokkos::create_mirror_view(a);
+    mirror_type ax = Kokkos::create_mirror(a);
+    ASSERT_EQ( & a() , & am() );
+  }
+
+  static void run_test_scalar()
+  {
+    typedef typename dView0::HostMirror  hView0 ;
+
+    dView0 dx , dy ;
+    hView0 hx , hy ;
+
+    dx = dView0( "dx" );
+    dy = dView0( "dy" );
+
+    hx = Kokkos::create_mirror( dx );
+    hy = Kokkos::create_mirror( dy );
+
+    hx = 1 ;
+
+    Kokkos::deep_copy( dx , hx );
+    Kokkos::deep_copy( dy , dx );
+    Kokkos::deep_copy( hy , dy );
+
+    ASSERT_EQ( hx(), hy() );
+  }
+
+  static void run_test()
+  {
+    // mfh 14 Feb 2014: This test doesn't actually create instances of
+    // these types.  In order to avoid "declared but unused typedef"
+    // warnings, we declare empty instances of these types, with the
+    // usual "(void)" marker to avoid compiler warnings for unused
+    // variables.
+
+    typedef typename dView0::HostMirror  hView0 ;
+    typedef typename dView1::HostMirror  hView1 ;
+    typedef typename dView2::HostMirror  hView2 ;
+    typedef typename dView3::HostMirror  hView3 ;
+    typedef typename dView4::HostMirror  hView4 ;
+
+    {
+      hView0 thing;
+      (void) thing;
+    }
+    {
+      hView1 thing;
+      (void) thing;
+    }
+    {
+      hView2 thing;
+      (void) thing;
+    }
+    {
+      hView3 thing;
+      (void) thing;
+    }
+    {
+      hView4 thing;
+      (void) thing;
+    }
+
+    dView4 dx , dy , dz ;
+    hView4 hx , hy , hz ;
+
+    ASSERT_TRUE( dx.is_null() );
+    ASSERT_TRUE( dy.is_null() );
+    ASSERT_TRUE( dz.is_null() );
+    ASSERT_TRUE( hx.is_null() );
+    ASSERT_TRUE( hy.is_null() );
+    ASSERT_TRUE( hz.is_null() );
+    ASSERT_EQ( dx.dimension_0() , 0u );
+    ASSERT_EQ( dy.dimension_0() , 0u );
+    ASSERT_EQ( dz.dimension_0() , 0u );
+    ASSERT_EQ( hx.dimension_0() , 0u );
+    ASSERT_EQ( hy.dimension_0() , 0u );
+    ASSERT_EQ( hz.dimension_0() , 0u );
+    ASSERT_EQ( dx.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( dy.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( dz.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( hx.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( hy.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( hz.dimension_1() , unsigned(N1) );
+
+    dx = dView4( "dx" , N0 );
+    dy = dView4( "dy" , N0 );
+
+
+
+    dView4_unmanaged unmanaged_dx = dx;
+    dView4_unmanaged unmanaged_from_ptr_dx = dView4_unmanaged(dx.ptr_on_device(),
+                                                              dx.dimension_0(),
+                                                              dx.dimension_1(),
+                                                              dx.dimension_2(),
+                                                              dx.dimension_3());
+
+    {
+      // Destruction of this view should be harmless
+      const_dView4 unmanaged_from_ptr_const_dx( dx.ptr_on_device() ,
+                                                dx.dimension_0() ,
+                                                dx.dimension_1() ,
+                                                dx.dimension_2() ,
+                                                dx.dimension_3() );
+    }
+
+    const_dView4 const_dx = dx ;
+
+
+    ASSERT_FALSE( dx.is_null() );
+    ASSERT_FALSE( const_dx.is_null() );
+    ASSERT_FALSE( unmanaged_dx.is_null() );
+    ASSERT_FALSE( unmanaged_from_ptr_dx.is_null() );
+    ASSERT_FALSE( dy.is_null() );
+    ASSERT_NE( dx , dy );
+
+    ASSERT_EQ( dx.dimension_0() , unsigned(N0) );
+    ASSERT_EQ( dx.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( dx.dimension_2() , unsigned(N2) );
+    ASSERT_EQ( dx.dimension_3() , unsigned(N3) );
+
+    ASSERT_EQ( dy.dimension_0() , unsigned(N0) );
+    ASSERT_EQ( dy.dimension_1() , unsigned(N1) );
+    ASSERT_EQ( dy.dimension_2() , unsigned(N2) );
+    ASSERT_EQ( dy.dimension_3() , unsigned(N3) );
+
+    ASSERT_EQ( unmanaged_from_ptr_dx.capacity(),unsigned(N0)*unsigned(N1)*unsigned(N2)*unsigned(N3) );
+
+    hx = Kokkos::create_mirror( dx );
+    hy = Kokkos::create_mirror( dy );
+
+    // T v1 = hx() ;    // Generates compile error as intended
+    // T v2 = hx(0,0) ; // Generates compile error as intended
+    // hx(0,0) = v2 ;   // Generates compile error as intended
+
+    size_t count = 0 ;
+    for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+    for ( size_t i1 = 0 ; i1 < hx.dimension_1() ; ++i1 ) {
+    for ( size_t i2 = 0 ; i2 < hx.dimension_2() ; ++i2 ) {
+    for ( size_t i3 = 0 ; i3 < hx.dimension_3() ; ++i3 ) {
+      hx(ip,i1,i2,i3) = ++count ;
+    }}}}
+
+    Kokkos::deep_copy( dx , hx );
+    Kokkos::deep_copy( dy , dx );
+    Kokkos::deep_copy( hy , dy );
+
+    for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+    for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+    for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+    for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+      { ASSERT_EQ( hx(ip,i1,i2,i3) , hy(ip,i1,i2,i3) ); }
+    }}}}
+
+    Kokkos::deep_copy( dx , T(0) );
+    Kokkos::deep_copy( hx , dx );
+
+    for ( size_t ip = 0 ; ip < N0 ; ++ip ) {
+    for ( size_t i1 = 0 ; i1 < N1 ; ++i1 ) {
+    for ( size_t i2 = 0 ; i2 < N2 ; ++i2 ) {
+    for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
+      { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
+    }}}}
+
+    dz = dx ; ASSERT_EQ( dx, dz); ASSERT_NE( dy, dz);
+    dz = dy ; ASSERT_EQ( dy, dz); ASSERT_NE( dx, dz);
+
+    dx = dView4();
+    ASSERT_TRUE( dx.is_null() );
+    ASSERT_FALSE( dy.is_null() );
+    ASSERT_FALSE( dz.is_null() );
+    dy = dView4();
+    ASSERT_TRUE( dx.is_null() );
+    ASSERT_TRUE( dy.is_null() );
+    ASSERT_FALSE( dz.is_null() );
+    dz = dView4();
+    ASSERT_TRUE( dx.is_null() );
+    ASSERT_TRUE( dy.is_null() );
+    ASSERT_TRUE( dz.is_null() );
+  }
+
+  typedef T DataType[2] ;
+
+  static void
+  check_auto_conversion_to_const(
+     const Kokkos::View< const DataType , device > & arg_const ,
+     const Kokkos::View< DataType , device > & arg )
+  {
+    ASSERT_TRUE( arg_const == arg );
+  }
+
+  static void run_test_const()
+  {
+    typedef Kokkos::View< DataType , device > typeX ;
+    typedef Kokkos::View< const DataType , device > const_typeX ;
+    typedef Kokkos::View< const DataType , device , Kokkos::MemoryRandomAccess > const_typeR ;
+    typeX x( "X" );
+    const_typeX xc = x ;
+    const_typeR xr = x ;
+
+    ASSERT_TRUE( xc == x );
+    ASSERT_TRUE( x == xc );
+    ASSERT_TRUE( x.ptr_on_device() == xr.ptr_on_device() );
+
+    // typeX xf = xc ; // setting non-const from const must not compile
+
+    check_auto_conversion_to_const( x , x );
+  }
+
+  static void run_test_subview()
+  {
+    typedef Kokkos::View< const T , device > sView ;
+
+    dView0 d0( "d0" );
+    dView1 d1( "d1" , N0 );
+    dView2 d2( "d2" , N0 );
+    dView3 d3( "d3" , N0 );
+    dView4 d4( "d4" , N0 );
+
+    sView s0 = d0 ;
+    sView s1 = Kokkos::subview( d1 , 1 );
+    sView s2 = Kokkos::subview( d2 , 1 , 1 );
+    sView s3 = Kokkos::subview( d3 , 1 , 1 , 1 );
+    sView s4 = Kokkos::subview( d4 , 1 , 1 , 1 , 1 );
+  }
+
+  static void run_test_subview_strided()
+  {
+    typedef Kokkos::View< int **** , Kokkos::LayoutLeft  , host >  view_left_4 ;
+    typedef Kokkos::View< int **** , Kokkos::LayoutRight , host >  view_right_4 ;
+    typedef Kokkos::View< int **   , Kokkos::LayoutLeft  , host >  view_left_2 ;
+    typedef Kokkos::View< int **   , Kokkos::LayoutRight , host >  view_right_2 ;
+
+    typedef Kokkos::View< int * ,  Kokkos::LayoutStride , host >  view_stride_1 ;
+    typedef Kokkos::View< int ** ,  Kokkos::LayoutStride , host >  view_stride_2 ;
+
+    view_left_2  xl2("xl2", 100 , 200 );
+    view_right_2 xr2("xr2", 100 , 200 );
+    view_stride_1  yl1 = Kokkos::subview( xl2 , 0 , Kokkos::ALL() );
+    view_stride_1  yl2 = Kokkos::subview( xl2 , 1 , Kokkos::ALL() );
+    view_stride_1  yr1 = Kokkos::subview( xr2 , 0 , Kokkos::ALL() );
+    view_stride_1  yr2 = Kokkos::subview( xr2 , 1 , Kokkos::ALL() );
+
+    ASSERT_EQ( yl1.dimension_0() , xl2.dimension_1() );
+    ASSERT_EQ( yl2.dimension_0() , xl2.dimension_1() );
+    ASSERT_EQ( yr1.dimension_0() , xr2.dimension_1() );
+    ASSERT_EQ( yr2.dimension_0() , xr2.dimension_1() );
+
+    ASSERT_EQ( & yl1(0) - & xl2(0,0) , 0 );
+    ASSERT_EQ( & yl2(0) - & xl2(1,0) , 0 );
+    ASSERT_EQ( & yr1(0) - & xr2(0,0) , 0 );
+    ASSERT_EQ( & yr2(0) - & xr2(1,0) , 0 );
+
+    view_left_4 xl4( "xl4" , 10 , 20 , 30 , 40 );
+    view_right_4 xr4( "xr4" , 10 , 20 , 30 , 40 );
+
+    view_stride_2 yl4 = Kokkos::subview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
+    view_stride_2 yr4 = Kokkos::subview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
+
+    ASSERT_EQ( yl4.dimension_0() , xl4.dimension_1() );
+    ASSERT_EQ( yl4.dimension_1() , xl4.dimension_3() );
+    ASSERT_EQ( yr4.dimension_0() , xr4.dimension_1() );
+    ASSERT_EQ( yr4.dimension_1() , xr4.dimension_3() );
+
+    ASSERT_EQ( & yl4(4,4) - & xl4(1,4,2,4) , 0 );
+    ASSERT_EQ( & yr4(4,4) - & xr4(1,4,2,4) , 0 );
+  }
+
+  static void run_test_vector()
+  {
+    static const unsigned Length = 1000 , Count = 8 ;
+
+    typedef Kokkos::View< T* ,  Kokkos::LayoutLeft , host > vector_type ;
+    typedef Kokkos::View< T** , Kokkos::LayoutLeft , host > multivector_type ;
+
+    typedef Kokkos::View< T* ,  Kokkos::LayoutRight , host > vector_right_type ;
+    typedef Kokkos::View< T** , Kokkos::LayoutRight , host > multivector_right_type ;
+
+    typedef Kokkos::View< const T* , Kokkos::LayoutRight, host > const_vector_right_type ;
+    typedef Kokkos::View< const T* , Kokkos::LayoutLeft , host > const_vector_type ;
+    typedef Kokkos::View< const T** , Kokkos::LayoutLeft , host > const_multivector_type ;
+
+    multivector_type mv = multivector_type( "mv" , Length , Count );
+    multivector_right_type mv_right = multivector_right_type( "mv" , Length , Count );
+
+    vector_type v1 = Kokkos::subview( mv , Kokkos::ALL() , 0 );
+    vector_type v2 = Kokkos::subview( mv , Kokkos::ALL() , 1 );
+    vector_type v3 = Kokkos::subview( mv , Kokkos::ALL() , 2 );
+
+    vector_type rv1 = Kokkos::subview( mv_right , 0 , Kokkos::ALL() );
+    vector_type rv2 = Kokkos::subview( mv_right , 1 , Kokkos::ALL() );
+    vector_type rv3 = Kokkos::subview( mv_right , 2 , Kokkos::ALL() );
+
+    multivector_type mv1 = Kokkos::subview( mv , std::make_pair( 1 , 998 ) ,
+                                                 std::make_pair( 2 , 5 ) );
+
+    multivector_right_type mvr1 =
+      Kokkos::subview( mv_right ,
+                       std::make_pair( 1 , 998 ) ,
+                       std::make_pair( 2 , 5 ) );
+
+    const_vector_type cv1 = Kokkos::subview( mv , Kokkos::ALL(), 0 );
+    const_vector_type cv2 = Kokkos::subview( mv , Kokkos::ALL(), 1 );
+    const_vector_type cv3 = Kokkos::subview( mv , Kokkos::ALL(), 2 );
+
+    vector_right_type vr1 = Kokkos::subview( mv , Kokkos::ALL() , 0 );
+    vector_right_type vr2 = Kokkos::subview( mv , Kokkos::ALL() , 1 );
+    vector_right_type vr3 = Kokkos::subview( mv , Kokkos::ALL() , 2 );
+
+    const_vector_right_type cvr1 = Kokkos::subview( mv , Kokkos::ALL() , 0 );
+    const_vector_right_type cvr2 = Kokkos::subview( mv , Kokkos::ALL() , 1 );
+    const_vector_right_type cvr3 = Kokkos::subview( mv , Kokkos::ALL() , 2 );
+
+    ASSERT_TRUE( & v1[0] == & v1(0) );
+    ASSERT_TRUE( & v1[0] == & mv(0,0) );
+    ASSERT_TRUE( & v2[0] == & mv(0,1) );
+    ASSERT_TRUE( & v3[0] == & mv(0,2) );
+
+    ASSERT_TRUE( & cv1[0] == & mv(0,0) );
+    ASSERT_TRUE( & cv2[0] == & mv(0,1) );
+    ASSERT_TRUE( & cv3[0] == & mv(0,2) );
+
+    ASSERT_TRUE( & vr1[0] == & mv(0,0) );
+    ASSERT_TRUE( & vr2[0] == & mv(0,1) );
+    ASSERT_TRUE( & vr3[0] == & mv(0,2) );
+
+    ASSERT_TRUE( & cvr1[0] == & mv(0,0) );
+    ASSERT_TRUE( & cvr2[0] == & mv(0,1) );
+    ASSERT_TRUE( & cvr3[0] == & mv(0,2) );
+
+    ASSERT_TRUE( & mv1(0,0) == & mv( 1 , 2 ) );
+    ASSERT_TRUE( & mv1(1,1) == & mv( 2 , 3 ) );
+    ASSERT_TRUE( & mv1(3,2) == & mv( 4 , 4 ) );
+    ASSERT_TRUE( & mvr1(0,0) == & mv_right( 1 , 2 ) );
+    ASSERT_TRUE( & mvr1(1,1) == & mv_right( 2 , 3 ) );
+    ASSERT_TRUE( & mvr1(3,2) == & mv_right( 4 , 4 ) );
+
+    const_vector_type c_cv1( v1 );
+    typename vector_type::const_type c_cv2( v2 );
+    typename const_vector_type::const_type c_ccv2( v2 );
+
+    const_multivector_type cmv( mv );
+    typename multivector_type::const_type cmvX( cmv );
+    typename const_multivector_type::const_type ccmvX( cmv );
+  }
+};
+
+} // namespace Test
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
diff --git a/lib/kokkos/core/unit_test/TestViewImpl.hpp b/lib/kokkos/core/unit_test/TestViewImpl.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..c51588777be7e7694a27b1ba24ce1f0fc45c0dc1
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestViewImpl.hpp
@@ -0,0 +1,289 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
+
+namespace Test {
+
+template < class Device >
+void test_view_impl() {}
+
+}
+
+#else
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+struct DummyMemorySpace
+{
+  typedef DummyMemorySpace memory_space ;
+  typedef unsigned size_type ;
+};
+
+/*--------------------------------------------------------------------------*/
+
+template< class Type >
+struct DefineShape {
+  typedef typename Kokkos::Impl::AnalyzeShape<Type>::shape type ;
+};
+
+template< class Type >
+struct ExtractValueType {
+  typedef typename Kokkos::Impl::AnalyzeShape<Type>::value_type type ;
+};
+
+template< class Type >
+struct ArrayType { typedef Type type ; };
+
+template < class Device >
+void test_view_impl()
+{
+  //typedef typename Device::memory_space memory_space ; // unused
+
+  typedef ArrayType< int[100]                >::type type_01 ;
+  typedef ArrayType< int*                    >::type type_11 ;
+  typedef ArrayType< int[5][6][700]          >::type type_03 ;
+  typedef ArrayType< double*[8][9][900]      >::type type_14 ;
+  typedef ArrayType< long**                  >::type type_22 ;
+  typedef ArrayType< short **[5][6][7]       >::type type_25 ;
+  typedef ArrayType< const short **[5][6][7] >::type const_type_25 ;
+  typedef ArrayType< short***[5][6][7]       >::type type_36 ;
+  typedef ArrayType< const short***[5][6][7] >::type const_type_36 ;
+
+  // mfh 14 Feb 2014: With gcc 4.8.2 -Wall, this emits a warning:
+  //
+  // typedef ‘ok_const_25’ locally defined but not used [-Wunused-local-typedefs]
+  //
+  // It's unfortunate that this is the case, because the typedef is
+  // being used for a compile-time check!  We deal with this by
+  // declaring an instance of ok_const_25, and marking it with
+  // "(void)" so that instance doesn't emit an "unused variable"
+  // warning.
+  //
+  // typedef typename Kokkos::Impl::StaticAssertSame<
+  //    typename Kokkos::Impl::AnalyzeShape<type_25>::const_type ,
+  //    typename Kokkos::Impl::AnalyzeShape<const_type_25>::type
+  //      > ok_const_25 ;
+
+  typedef typename Kokkos::Impl::StaticAssertSame<
+    typename Kokkos::Impl::AnalyzeShape<type_25>::const_type,
+    typename Kokkos::Impl::AnalyzeShape<const_type_25>::type
+      > ok_const_25 ;
+
+  typedef typename Kokkos::Impl::StaticAssertSame<
+    typename Kokkos::Impl::AnalyzeShape<type_36>::const_type,
+    typename Kokkos::Impl::AnalyzeShape<const_type_36>::type
+      > ok_const_36 ;
+  {
+    ok_const_25 thing_25 ;
+    ok_const_36 thing_36 ;
+    (void) thing_25 ; // silence warning for unused variable
+    (void) thing_36 ; // silence warning for unused variable
+  }
+
+  ASSERT_TRUE( ( Kokkos::Impl::is_same< ExtractValueType<type_03>::type , int >::value ) );
+  ASSERT_TRUE( ( Kokkos::Impl::is_same< ExtractValueType<type_14>::type , double >::value ) );
+  ASSERT_TRUE( ( Kokkos::Impl::is_same< ExtractValueType<type_22>::type , long >::value ) );
+  ASSERT_TRUE( ( Kokkos::Impl::is_same< ExtractValueType<type_36>::type , short >::value ) );
+
+  ASSERT_FALSE( ( Kokkos::Impl::is_same< ExtractValueType<type_36>::type , int >::value ) );
+
+  typedef typename DefineShape< type_01 >::type  shape_01_type ;
+  typedef typename DefineShape< type_11 >::type  shape_11_type ;
+  typedef typename DefineShape< type_03 >::type  shape_03_type ;
+  typedef typename DefineShape< type_14 >::type  shape_14_type ;
+  typedef typename DefineShape< type_22 >::type  shape_22_type ;
+  typedef typename DefineShape< type_36 >::type  shape_36_type ;
+
+  ASSERT_TRUE( ( Kokkos::Impl::StaticAssert< shape_36_type::rank == 6 >::value ) );
+  ASSERT_TRUE( ( Kokkos::Impl::StaticAssert< shape_03_type::rank == 3 >::value ) );
+
+  shape_01_type shape_01 ; shape_01_type::assign( shape_01 );
+  shape_11_type shape_11 ; shape_11_type::assign( shape_11, 1000 );
+  shape_03_type shape_03 ; shape_03_type::assign( shape_03 );
+  shape_14_type shape_14 ; shape_14_type::assign( shape_14 , 0 );
+  shape_22_type shape_22 ; shape_22_type::assign( shape_22 , 0 , 0 );
+  shape_36_type shape_36 ; shape_36_type::assign( shape_36 , 10 , 20 , 30 );
+
+  ASSERT_TRUE( shape_01.rank_dynamic == 0u );
+  ASSERT_TRUE( shape_01.rank         == 1u );
+  ASSERT_TRUE( shape_01.N0           == 100u );
+
+  ASSERT_TRUE( shape_11.rank_dynamic == 1u );
+  ASSERT_TRUE( shape_11.rank         == 1u );
+  ASSERT_TRUE( shape_11.N0           == 1000u );
+
+  ASSERT_TRUE( shape_03.rank_dynamic == 0u );
+  ASSERT_TRUE( shape_03.rank         == 3u );
+  ASSERT_TRUE( shape_03.N0           == 5u );
+  ASSERT_TRUE( shape_03.N1           == 6u );
+  ASSERT_TRUE( shape_03.N2           == 700u );
+
+  ASSERT_TRUE( shape_14.rank_dynamic == 1u );
+  ASSERT_TRUE( shape_14.rank         == 4u );
+  ASSERT_TRUE( shape_14.N0           == 0u );
+  ASSERT_TRUE( shape_14.N1           == 8u );
+  ASSERT_TRUE( shape_14.N2           == 9u );
+  ASSERT_TRUE( shape_14.N3           == 900u );
+
+  ASSERT_TRUE( shape_22.rank_dynamic == 2u );
+  ASSERT_TRUE( shape_22.rank         == 2u );
+  ASSERT_TRUE( shape_22.N0           == 0u );
+  ASSERT_TRUE( shape_22.N1           == 0u );
+
+  ASSERT_TRUE( shape_36.rank_dynamic == 3u );
+  ASSERT_TRUE( shape_36.rank         == 6u );
+  ASSERT_TRUE( shape_36.N0           == 10u );
+  ASSERT_TRUE( shape_36.N1           == 20u );
+  ASSERT_TRUE( shape_36.N2           == 30u );
+  ASSERT_TRUE( shape_36.N3           == 5u  );
+  ASSERT_TRUE( shape_36.N4           == 6u  );
+  ASSERT_TRUE( shape_36.N5           == 7u  );
+
+
+  ASSERT_TRUE( shape_01 == shape_01 );
+  ASSERT_TRUE( shape_11 == shape_11 );
+  ASSERT_TRUE( shape_36 == shape_36 );
+  ASSERT_TRUE( shape_01 != shape_36 );
+  ASSERT_TRUE( shape_22 != shape_36 );
+
+  //------------------------------------------------------------------------
+
+  typedef Kokkos::Impl::ViewOffset< shape_01_type , Kokkos::LayoutLeft > shape_01_left_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_11_type , Kokkos::LayoutLeft > shape_11_left_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_03_type , Kokkos::LayoutLeft > shape_03_left_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_14_type , Kokkos::LayoutLeft > shape_14_left_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_22_type , Kokkos::LayoutLeft > shape_22_left_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_36_type , Kokkos::LayoutLeft > shape_36_left_offset ;
+
+  typedef Kokkos::Impl::ViewOffset< shape_01_type , Kokkos::LayoutRight > shape_01_right_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_11_type , Kokkos::LayoutRight > shape_11_right_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_03_type , Kokkos::LayoutRight > shape_03_right_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_14_type , Kokkos::LayoutRight > shape_14_right_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_22_type , Kokkos::LayoutRight > shape_22_right_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_36_type , Kokkos::LayoutRight > shape_36_right_offset ;
+
+  ASSERT_TRUE( ! shape_01_left_offset::has_padding );
+  ASSERT_TRUE( ! shape_11_left_offset::has_padding );
+  ASSERT_TRUE( ! shape_03_left_offset::has_padding );
+  ASSERT_TRUE(   shape_14_left_offset::has_padding );
+  ASSERT_TRUE(   shape_22_left_offset::has_padding );
+  ASSERT_TRUE(   shape_36_left_offset::has_padding );
+
+  ASSERT_TRUE( ! shape_01_right_offset::has_padding );
+  ASSERT_TRUE( ! shape_11_right_offset::has_padding );
+  ASSERT_TRUE( ! shape_03_right_offset::has_padding );
+  ASSERT_TRUE( ! shape_14_right_offset::has_padding );
+  ASSERT_TRUE(   shape_22_right_offset::has_padding );
+  ASSERT_TRUE(   shape_36_right_offset::has_padding );
+
+  //------------------------------------------------------------------------
+
+  typedef Kokkos::Impl::ViewOffset< shape_01_type , Kokkos::LayoutStride > shape_01_stride_offset ;
+  typedef Kokkos::Impl::ViewOffset< shape_36_type , Kokkos::LayoutStride > shape_36_stride_offset ;
+
+  {
+    shape_01_stride_offset stride_offset_01 ;
+
+    stride_offset_01.assign( 1, stride_offset_01.N0, 0,0,0,0,0,0,0 );
+
+    ASSERT_EQ( int(stride_offset_01.S[0]) , int(1) );
+    ASSERT_EQ( int(stride_offset_01.S[1]) , int(stride_offset_01.N0) );
+  }
+
+  {
+    shape_36_stride_offset stride_offset_36 ;
+
+    size_t str[7] ;
+    str[5] = 1 ;
+    str[4] = str[5] * stride_offset_36.N5 ;
+    str[3] = str[4] * stride_offset_36.N4 ;
+    str[2] = str[3] * stride_offset_36.N3 ;
+    str[1] = str[2] * 100 ;
+    str[0] = str[1] * 200 ;
+    str[6] = str[0] * 300 ;
+
+    stride_offset_36.assign( str[0] , str[1] , str[2] , str[3] , str[4] , str[5] , str[6] , 0 , 0 );
+
+    ASSERT_EQ( size_t(stride_offset_36.S[6]) , size_t(str[6]) );
+    ASSERT_EQ( size_t(stride_offset_36.N2)   , size_t(100) );
+    ASSERT_EQ( size_t(stride_offset_36.N1)   , size_t(200) );
+    ASSERT_EQ( size_t(stride_offset_36.N0)   , size_t(300) );
+  }
+
+  //------------------------------------------------------------------------
+
+  {
+    const int rank = 6 ;
+    const int order[] = { 5 , 3 , 1 , 0 , 2 , 4 };
+    const unsigned dim[] = { 2 , 3 , 5 , 7 , 11 , 13 };
+    Kokkos::LayoutStride stride_6 = Kokkos::LayoutStride::order_dimensions( rank , order , dim );
+    size_t n = 1 ;
+    for ( int i = 0 ; i < rank ; ++i ) {
+      ASSERT_EQ( size_t(dim[i]) , size_t( stride_6.dimension[i] ) );
+      ASSERT_EQ( size_t(n) , size_t( stride_6.stride[ order[i] ] ) );
+      n *= dim[order[i]] ;
+    }
+  }
+
+  //------------------------------------------------------------------------
+}
+
+} /* namespace Test */
+
+#endif
+
+/*--------------------------------------------------------------------------*/
+
diff --git a/lib/kokkos/core/unit_test/TestViewMapping.hpp b/lib/kokkos/core/unit_test/TestViewMapping.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..31e0c6a7b04690382d1c608664680f089e54fb5a
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestViewMapping.hpp
@@ -0,0 +1,1018 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+#include <Kokkos_Core.hpp>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+template< class RangeType >
+void test_view_range( const size_t N , const RangeType & range , const size_t begin , const size_t dim )
+{
+  typedef Kokkos::Experimental::Impl::ViewOffsetRange< RangeType >  query ;
+
+  ASSERT_EQ( query::begin( range ) , begin );
+  ASSERT_EQ( query::dimension( N , range ) , dim );
+  ASSERT_EQ( query::is_range , dim != 0 );
+}
+
+
+template< class ExecSpace >
+void test_view_mapping()
+{
+  typedef Kokkos::Experimental::Impl::ViewDimension<>  dim_0 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<2> dim_s2 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<2,3> dim_s2_s3 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<2,3,4> dim_s2_s3_s4 ;
+
+  typedef Kokkos::Experimental::Impl::ViewDimension<0> dim_s0 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,3> dim_s0_s3 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,3,4> dim_s0_s3_s4 ;
+
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,0> dim_s0_s0 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,4> dim_s0_s0_s4 ;
+
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0> dim_s0_s0_s0 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0> dim_s0_s0_s0_s0 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0> dim_s0_s0_s0_s0_s0 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0,0> dim_s0_s0_s0_s0_s0_s0 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0,0,0> dim_s0_s0_s0_s0_s0_s0_s0 ;
+  typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0,0,0,0,0,0> dim_s0_s0_s0_s0_s0_s0_s0_s0 ;
+
+  // Fully static dimensions should not be larger than an int
+  ASSERT_LE( sizeof(dim_0) , sizeof(int) );
+  ASSERT_LE( sizeof(dim_s2) , sizeof(int) );
+  ASSERT_LE( sizeof(dim_s2_s3) , sizeof(int) );
+  ASSERT_LE( sizeof(dim_s2_s3_s4) , sizeof(int) );
+
+  // Rank 1 is size_t
+  ASSERT_EQ( sizeof(dim_s0) , sizeof(size_t) );
+  ASSERT_EQ( sizeof(dim_s0_s3) , sizeof(size_t) );
+  ASSERT_EQ( sizeof(dim_s0_s3_s4) , sizeof(size_t) );
+
+  // Allow for padding
+  ASSERT_LE( sizeof(dim_s0_s0) , 2 * sizeof(size_t) );
+  ASSERT_LE( sizeof(dim_s0_s0_s4) , 2 * sizeof(size_t) );
+
+  ASSERT_LE( sizeof(dim_s0_s0_s0) , 4 * sizeof(size_t) );
+  ASSERT_EQ( sizeof(dim_s0_s0_s0_s0) , 4 * sizeof(unsigned) );
+  ASSERT_LE( sizeof(dim_s0_s0_s0_s0_s0) , 6 * sizeof(unsigned) );
+  ASSERT_EQ( sizeof(dim_s0_s0_s0_s0_s0_s0) , 6 * sizeof(unsigned) );
+  ASSERT_LE( sizeof(dim_s0_s0_s0_s0_s0_s0_s0) , 8 * sizeof(unsigned) );
+  ASSERT_EQ( sizeof(dim_s0_s0_s0_s0_s0_s0_s0_s0) , 8 * sizeof(unsigned) );
+
+  ASSERT_EQ( int(dim_0::rank) , int(0) );
+  ASSERT_EQ( int(dim_0::rank_dynamic) , int(0) );
+
+  ASSERT_EQ( int(dim_s2::rank) , int(1) );
+  ASSERT_EQ( int(dim_s2::rank_dynamic) , int(0) );
+
+  ASSERT_EQ( int(dim_s2_s3::rank) , int(2) );
+  ASSERT_EQ( int(dim_s2_s3::rank_dynamic) , int(0) );
+
+  ASSERT_EQ( int(dim_s2_s3_s4::rank) , int(3) );
+  ASSERT_EQ( int(dim_s2_s3_s4::rank_dynamic) , int(0) );
+
+  ASSERT_EQ( int(dim_s0::rank) , int(1) );
+  ASSERT_EQ( int(dim_s0::rank_dynamic) , int(1) );
+
+  ASSERT_EQ( int(dim_s0_s3::rank) , int(2) );
+  ASSERT_EQ( int(dim_s0_s3::rank_dynamic) , int(1) );
+
+  ASSERT_EQ( int(dim_s0_s3_s4::rank) , int(3) );
+  ASSERT_EQ( int(dim_s0_s3_s4::rank_dynamic) , int(1) );
+
+  ASSERT_EQ( int(dim_s0_s0_s4::rank) , int(3) );
+  ASSERT_EQ( int(dim_s0_s0_s4::rank_dynamic) , int(2) );
+
+  ASSERT_EQ( int(dim_s0_s0_s0::rank) , int(3) );
+  ASSERT_EQ( int(dim_s0_s0_s0::rank_dynamic) , int(3) );
+
+  ASSERT_EQ( int(dim_s0_s0_s0_s0::rank) , int(4) );
+  ASSERT_EQ( int(dim_s0_s0_s0_s0::rank_dynamic) , int(4) );
+
+  ASSERT_EQ( int(dim_s0_s0_s0_s0_s0::rank) , int(5) );
+  ASSERT_EQ( int(dim_s0_s0_s0_s0_s0::rank_dynamic) , int(5) );
+
+  ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0::rank) , int(6) );
+  ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0::rank_dynamic) , int(6) );
+
+  ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0_s0::rank) , int(7) );
+  ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) , int(7) );
+
+  ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank) , int(8) );
+  ASSERT_EQ( int(dim_s0_s0_s0_s0_s0_s0_s0_s0::rank_dynamic) , int(8) );
+
+  dim_s0          d1( 2, 3, 4, 5, 6, 7, 8, 9 ); 
+  dim_s0_s0       d2( 2, 3, 4, 5, 6, 7, 8, 9 );
+  dim_s0_s0_s0    d3( 2, 3, 4, 5, 6, 7, 8, 9 );
+  dim_s0_s0_s0_s0 d4( 2, 3, 4, 5, 6, 7, 8, 9 );
+
+  ASSERT_EQ( d1.N0 , 2 );
+  ASSERT_EQ( d2.N0 , 2 );
+  ASSERT_EQ( d3.N0 , 2 );
+  ASSERT_EQ( d4.N0 , 2 );
+
+  ASSERT_EQ( d1.N1 , 1 );
+  ASSERT_EQ( d2.N1 , 3 );
+  ASSERT_EQ( d3.N1 , 3 );
+  ASSERT_EQ( d4.N1 , 3 );
+
+  ASSERT_EQ( d1.N2 , 1 );
+  ASSERT_EQ( d2.N2 , 1 );
+  ASSERT_EQ( d3.N2 , 4 );
+  ASSERT_EQ( d4.N2 , 4 );
+
+  ASSERT_EQ( d1.N3 , 1 );
+  ASSERT_EQ( d2.N3 , 1 );
+  ASSERT_EQ( d3.N3 , 1 );
+  ASSERT_EQ( d4.N3 , 5 );
+
+  //----------------------------------------
+
+  typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s0 , Kokkos::LayoutStride >  stride_s0_s0_s0 ;
+
+  //----------------------------------------
+  // Static dimension
+  {
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4 , Kokkos::LayoutLeft > left_s2_s3_s4 ;
+
+    ASSERT_EQ( sizeof(left_s2_s3_s4) , sizeof(dim_s2_s3_s4) );
+
+    left_s2_s3_s4 off3 ;
+
+    stride_s0_s0_s0  stride3( off3 );
+
+    ASSERT_EQ( off3.stride_0() , 1 );
+    ASSERT_EQ( off3.stride_1() , 2 );
+    ASSERT_EQ( off3.stride_2() , 6 );
+    ASSERT_EQ( off3.span() , 24 );
+
+    ASSERT_EQ( off3.stride_0() , stride3.stride_0() );
+    ASSERT_EQ( off3.stride_1() , stride3.stride_1() );
+    ASSERT_EQ( off3.stride_2() , stride3.stride_2() );
+    ASSERT_EQ( off3.span() , stride3.span() );
+
+    int offset = 0 ;
+
+    for ( int k = 0 ; k < 4 ; ++k ){
+    for ( int j = 0 ; j < 3 ; ++j ){
+    for ( int i = 0 ; i < 2 ; ++i , ++offset ){
+      ASSERT_EQ( off3(i,j,k) , offset );
+      ASSERT_EQ( stride3(i,j,k) , off3(i,j,k) );
+    }}}
+  }
+
+  //----------------------------------------
+  // Small dimension is unpadded
+  {
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutLeft > left_s0_s0_s4 ;
+
+    left_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>(), 2, 3, 0, 0, 0, 0, 0, 0 );
+
+    stride_s0_s0_s0  stride3( dyn_off3 );
+
+    ASSERT_EQ( dyn_off3.m_dim.rank , 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0 , 2 );
+    ASSERT_EQ( dyn_off3.m_dim.N1 , 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N2 , 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3 , 1 );
+    ASSERT_EQ( dyn_off3.size() , 2 * 3 * 4 );
+
+    ASSERT_EQ( stride3.m_dim.rank , 3 );
+    ASSERT_EQ( stride3.m_dim.N0 , 2 );
+    ASSERT_EQ( stride3.m_dim.N1 , 3 );
+    ASSERT_EQ( stride3.m_dim.N2 , 4 );
+    ASSERT_EQ( stride3.m_dim.N3 , 1 );
+    ASSERT_EQ( stride3.size() , 2 * 3 * 4 );
+
+    int offset = 0 ;
+
+    for ( int k = 0 ; k < 4 ; ++k ){
+    for ( int j = 0 ; j < 3 ; ++j ){
+    for ( int i = 0 ; i < 2 ; ++i , ++offset ){
+      ASSERT_EQ( offset , dyn_off3(i,j,k) );
+      ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) );
+    }}}
+
+    ASSERT_EQ( dyn_off3.span() , offset );
+    ASSERT_EQ( stride3.span() , dyn_off3.span() );
+  }
+
+  // Large dimension is likely padded
+  {
+    constexpr int N0 = 2000 ;
+    constexpr int N1 = 300 ;
+
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutLeft > left_s0_s0_s4 ;
+
+    left_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>(), N0, N1, 0, 0, 0, 0, 0, 0 );
+
+    stride_s0_s0_s0  stride3( dyn_off3 );
+
+    ASSERT_EQ( dyn_off3.m_dim.rank , 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0 , N0 );
+    ASSERT_EQ( dyn_off3.m_dim.N1 , N1 );
+    ASSERT_EQ( dyn_off3.m_dim.N2 , 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3 , 1 );
+    ASSERT_EQ( dyn_off3.size() , N0 * N1 * 4 );
+
+    ASSERT_EQ( stride3.m_dim.rank , 3 );
+    ASSERT_EQ( stride3.m_dim.N0 , N0 );
+    ASSERT_EQ( stride3.m_dim.N1 , N1 );
+    ASSERT_EQ( stride3.m_dim.N2 , 4 );
+    ASSERT_EQ( stride3.m_dim.N3 , 1 );
+    ASSERT_EQ( stride3.size() , N0 * N1 * 4 );
+    ASSERT_EQ( stride3.span() , dyn_off3.span() );
+
+    int offset = 0 ;
+
+    for ( int k = 0 ; k < 4 ; ++k ){
+    for ( int j = 0 ; j < N1 ; ++j ){
+    for ( int i = 0 ; i < N0 ; ++i ){
+      ASSERT_LE( offset , dyn_off3(i,j,k) );
+      ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) );
+      offset = dyn_off3(i,j,k) + 1 ;
+    }}}
+
+    ASSERT_LE( offset , dyn_off3.span() );
+  }
+
+  //----------------------------------------
+  // Static dimension
+  {
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s2_s3_s4 , Kokkos::LayoutRight > right_s2_s3_s4 ;
+
+    ASSERT_EQ( sizeof(right_s2_s3_s4) , sizeof(dim_s2_s3_s4) );
+
+    right_s2_s3_s4 off3 ;
+
+    stride_s0_s0_s0  stride3( off3 );
+
+    ASSERT_EQ( off3.stride_0() , 12 );
+    ASSERT_EQ( off3.stride_1() , 4 );
+    ASSERT_EQ( off3.stride_2() , 1 );
+
+    ASSERT_EQ( off3.dimension_0() , stride3.dimension_0() );
+    ASSERT_EQ( off3.dimension_1() , stride3.dimension_1() );
+    ASSERT_EQ( off3.dimension_2() , stride3.dimension_2() );
+    ASSERT_EQ( off3.stride_0() , stride3.stride_0() );
+    ASSERT_EQ( off3.stride_1() , stride3.stride_1() );
+    ASSERT_EQ( off3.stride_2() , stride3.stride_2() );
+    ASSERT_EQ( off3.span() , stride3.span() );
+
+    int offset = 0 ;
+
+    for ( int i = 0 ; i < 2 ; ++i ){
+    for ( int j = 0 ; j < 3 ; ++j ){
+    for ( int k = 0 ; k < 4 ; ++k , ++offset ){
+      ASSERT_EQ( off3(i,j,k) , offset );
+      ASSERT_EQ( off3(i,j,k) , stride3(i,j,k) );
+    }}}
+
+    ASSERT_EQ( off3.span() , offset );
+  }
+
+  //----------------------------------------
+  // Small dimension is unpadded
+  {
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutRight > right_s0_s0_s4 ;
+
+    right_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>(), 2, 3, 0, 0, 0, 0, 0, 0 );
+
+    stride_s0_s0_s0  stride3( dyn_off3 );
+
+    ASSERT_EQ( dyn_off3.m_dim.rank , 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0 , 2 );
+    ASSERT_EQ( dyn_off3.m_dim.N1 , 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N2 , 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3 , 1 );
+    ASSERT_EQ( dyn_off3.size() , 2 * 3 * 4 );
+
+    ASSERT_EQ( dyn_off3.dimension_0() , stride3.dimension_0() );
+    ASSERT_EQ( dyn_off3.dimension_1() , stride3.dimension_1() );
+    ASSERT_EQ( dyn_off3.dimension_2() , stride3.dimension_2() );
+    ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() );
+    ASSERT_EQ( dyn_off3.span() , stride3.span() );
+
+    int offset = 0 ;
+
+    for ( int i = 0 ; i < 2 ; ++i ){
+    for ( int j = 0 ; j < 3 ; ++j ){
+    for ( int k = 0 ; k < 4 ; ++k , ++offset ){
+      ASSERT_EQ( offset , dyn_off3(i,j,k) );
+      ASSERT_EQ( dyn_off3(i,j,k) , stride3(i,j,k) );
+    }}}
+
+    ASSERT_EQ( dyn_off3.span() , offset );
+  }
+
+  // Large dimension is likely padded
+  {
+    constexpr int N0 = 2000 ;
+    constexpr int N1 = 300 ;
+
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutRight > right_s0_s0_s4 ;
+
+    right_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>(), N0, N1, 0, 0, 0, 0, 0, 0 );
+
+    stride_s0_s0_s0  stride3( dyn_off3 );
+
+    ASSERT_EQ( dyn_off3.m_dim.rank , 3 );
+    ASSERT_EQ( dyn_off3.m_dim.N0 , N0 );
+    ASSERT_EQ( dyn_off3.m_dim.N1 , N1 );
+    ASSERT_EQ( dyn_off3.m_dim.N2 , 4 );
+    ASSERT_EQ( dyn_off3.m_dim.N3 , 1 );
+    ASSERT_EQ( dyn_off3.size() , N0 * N1 * 4 );
+
+    ASSERT_EQ( dyn_off3.dimension_0() , stride3.dimension_0() );
+    ASSERT_EQ( dyn_off3.dimension_1() , stride3.dimension_1() );
+    ASSERT_EQ( dyn_off3.dimension_2() , stride3.dimension_2() );
+    ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() );
+    ASSERT_EQ( dyn_off3.span() , stride3.span() );
+
+    int offset = 0 ;
+
+    for ( int i = 0 ; i < N0 ; ++i ){
+    for ( int j = 0 ; j < N1 ; ++j ){
+    for ( int k = 0 ; k < 4 ; ++k ){
+      ASSERT_LE( offset , dyn_off3(i,j,k) );
+      ASSERT_EQ( dyn_off3(i,j,k) , stride3(i,j,k) );
+      offset = dyn_off3(i,j,k) + 1 ;
+    }}}
+
+    ASSERT_LE( offset , dyn_off3.span() );
+  }
+
+  //----------------------------------------
+  // Subview
+  {
+    constexpr int N0 = 2000 ;
+    constexpr int N1 = 300 ;
+
+    constexpr int sub_N0 = 1000 ;
+    constexpr int sub_N1 = 200 ;
+    constexpr int sub_N2 = 4 ;
+
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutLeft > left_s0_s0_s4 ;
+
+    left_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>(), N0, N1, 0, 0, 0, 0, 0, 0 );
+
+    stride_s0_s0_s0  stride3( dyn_off3 , sub_N0 , sub_N1 , sub_N2 , 0 , 0 , 0 , 0 , 0 );
+
+    ASSERT_EQ( stride3.dimension_0() , sub_N0 );
+    ASSERT_EQ( stride3.dimension_1() , sub_N1 );
+    ASSERT_EQ( stride3.dimension_2() , sub_N2 );
+    ASSERT_EQ( stride3.size() , sub_N0 * sub_N1 * sub_N2 );
+
+    ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() );
+    ASSERT_GE( dyn_off3.span()   , stride3.span() );
+
+    for ( int k = 0 ; k < sub_N2 ; ++k ){
+    for ( int j = 0 ; j < sub_N1 ; ++j ){
+    for ( int i = 0 ; i < sub_N0 ; ++i ){
+      ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) );
+    }}}
+  }
+
+  {
+    constexpr int N0 = 2000 ;
+    constexpr int N1 = 300 ;
+
+    constexpr int sub_N0 = 1000 ;
+    constexpr int sub_N1 = 200 ;
+    constexpr int sub_N2 = 4 ;
+
+    typedef Kokkos::Experimental::Impl::ViewOffset< dim_s0_s0_s4 , Kokkos::LayoutRight > right_s0_s0_s4 ;
+
+    right_s0_s0_s4 dyn_off3( std::integral_constant<unsigned,sizeof(int)>(), N0, N1, 0, 0, 0, 0, 0, 0 );
+
+    stride_s0_s0_s0  stride3( dyn_off3 , sub_N0 , sub_N1 , sub_N2 , 0 , 0 , 0 , 0 , 0 );
+
+    ASSERT_EQ( stride3.dimension_0() , sub_N0 );
+    ASSERT_EQ( stride3.dimension_1() , sub_N1 );
+    ASSERT_EQ( stride3.dimension_2() , sub_N2 );
+    ASSERT_EQ( stride3.size() , sub_N0 * sub_N1 * sub_N2 );
+
+    ASSERT_EQ( dyn_off3.stride_0() , stride3.stride_0() );
+    ASSERT_EQ( dyn_off3.stride_1() , stride3.stride_1() );
+    ASSERT_EQ( dyn_off3.stride_2() , stride3.stride_2() );
+    ASSERT_GE( dyn_off3.span()   , stride3.span() );
+
+    for ( int i = 0 ; i < sub_N0 ; ++i ){
+    for ( int j = 0 ; j < sub_N1 ; ++j ){
+    for ( int k = 0 ; k < sub_N2 ; ++k ){
+      ASSERT_EQ( stride3(i,j,k) , dyn_off3(i,j,k) );
+    }}}
+  }
+
+  //----------------------------------------
+  {
+    constexpr int N = 1000 ;
+
+    test_view_range( N , N / 2 , N / 2 , 0 );
+    test_view_range( N , Kokkos::Experimental::ALL , 0 , N );
+    test_view_range( N , std::pair<int,int>( N / 4 , 10 + N / 4 ) , N / 4 , 10 );
+    test_view_range( N , Kokkos::pair<int,int>( N / 4 , 10 + N / 4 ) , N / 4 , 10 );
+  }
+  //----------------------------------------
+  // view data analysis
+
+  {
+    typedef Kokkos::Experimental::Impl::ViewDataAnalysis< const int[] >  a_const_int_r1 ;
+
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::specialize , void >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::dimension , Kokkos::Experimental::Impl::ViewDimension<0> >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::type , const int[] >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::value_type , const int >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::array_scalar_type , const int[] >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::const_type , const int[] >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::const_value_type , const int >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::const_array_scalar_type , const int[] >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::non_const_type , int [] >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r1::non_const_value_type , int >::value ));
+
+    typedef Kokkos::Experimental::Impl::ViewDataAnalysis< const int**[4] >  a_const_int_r3 ;
+
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::specialize , void >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::dimension , Kokkos::Experimental::Impl::ViewDimension<0,0,4> >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::type , const int**[4] >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::value_type , const int >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::array_scalar_type , const int**[4] >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::const_type , const int**[4] >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::const_value_type , const int >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::const_array_scalar_type , const int**[4] >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::non_const_type , int**[4] >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::non_const_value_type , int >::value ));
+    ASSERT_TRUE( ( std::is_same< typename a_const_int_r3::non_const_array_scalar_type , int**[4] >::value ));
+  }
+
+  //----------------------------------------
+
+  {
+    constexpr int N = 10 ;
+
+    typedef Kokkos::Experimental::View<int*,ExecSpace>        T ;
+    typedef Kokkos::Experimental::View<const int*,ExecSpace>  C ;
+
+    int data[N] ;
+
+    T vr1(data,N);
+    C cr1(vr1);
+
+    // Generate static_assert error:
+    // T tmp( cr1 );
+
+    ASSERT_EQ( vr1.span() , N );
+    ASSERT_EQ( cr1.span() , N );
+    ASSERT_EQ( vr1.data() , & data[0] );
+    ASSERT_EQ( cr1.data() , & data[0] );
+
+    ASSERT_TRUE( ( std::is_same< typename T::data_type           , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_data_type     , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_data_type , int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::array_scalar_type           , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_array_scalar_type     , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_array_scalar_type , int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::value_type           , int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_value_type     , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_value_type , int >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::memory_space , typename ExecSpace::memory_space >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::reference_type , int & >::value ) );
+
+    ASSERT_EQ( T::Rank , 1 );
+
+    ASSERT_TRUE( ( std::is_same< typename C::data_type           , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::const_data_type     , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::non_const_data_type , int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename C::array_scalar_type           , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::const_array_scalar_type     , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::non_const_array_scalar_type , int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename C::value_type           , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::const_value_type     , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::non_const_value_type , int >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename C::memory_space , typename ExecSpace::memory_space >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename C::reference_type , const int & >::value ) );
+
+    ASSERT_EQ( C::Rank , 1 );
+
+    ASSERT_EQ( vr1.dimension_0() , N );
+
+    if ( Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename ExecSpace::memory_space , Kokkos::HostSpace >::value ) {
+      for ( int i = 0 ; i < N ; ++i ) data[i] = i + 1 ;
+      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 1 );
+      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( cr1[i] , i + 1 );
+
+      {
+        T tmp( vr1 );
+        for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 1 );
+        for ( int i = 0 ; i < N ; ++i ) vr1(i) = i + 2 ;
+        for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 2 );
+      }
+
+      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 2 );
+    }
+  }
+
+  {
+    constexpr int N = 10 ;
+    typedef Kokkos::Experimental::View<int*,ExecSpace>        T ;
+    typedef Kokkos::Experimental::View<const int*,ExecSpace>  C ;
+
+    T vr1("vr1",N);
+    C cr1(vr1);
+
+    ASSERT_TRUE( ( std::is_same< typename T::data_type           , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_data_type     , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_data_type , int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::array_scalar_type           , int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_array_scalar_type     , const int* >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_array_scalar_type , int* >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::value_type           , int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::const_value_type     , const int >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::non_const_value_type , int >::value ) );
+
+    ASSERT_TRUE( ( std::is_same< typename T::memory_space , typename ExecSpace::memory_space >::value ) );
+    ASSERT_TRUE( ( std::is_same< typename T::reference_type , int & >::value ) );
+    ASSERT_EQ( T::Rank , 1 );
+ 
+    ASSERT_EQ( vr1.dimension_0() , N );
+
+    if ( Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< typename ExecSpace::memory_space , Kokkos::HostSpace >::value ) {
+      for ( int i = 0 ; i < N ; ++i ) vr1(i) = i + 1 ;
+      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 1 );
+      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( cr1[i] , i + 1 );
+
+      {
+        T tmp( vr1 );
+        for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 1 );
+        for ( int i = 0 ; i < N ; ++i ) vr1(i) = i + 2 ;
+        for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( tmp[i] , i + 2 );
+      }
+
+      for ( int i = 0 ; i < N ; ++i ) ASSERT_EQ( vr1[i] , i + 2 );
+    }
+  }
+
+  {
+    using namespace Kokkos::Experimental ;
+
+    typedef typename ExecSpace::memory_space  memory_space ;
+    typedef View<int*,memory_space>           V ;
+
+    constexpr int N = 10 ;
+
+    memory_space mem_space ;
+
+    V v( "v" , N );
+    V va( view_alloc() , N );
+    V vb( view_alloc( "vb" ) , N );
+    V vc( view_alloc( "vc" , AllowPadding ) , N );
+    V vd( view_alloc( "vd" , WithoutInitializing ) , N );
+    V ve( view_alloc( "ve" , WithoutInitializing , AllowPadding ) , N );
+    V vf( view_alloc( "vf" , mem_space , WithoutInitializing , AllowPadding ) , N );
+    V vg( view_alloc( mem_space , "vg" , WithoutInitializing , AllowPadding ) , N );
+    V vh( view_alloc( WithoutInitializing , AllowPadding ) , N );
+    V vi( view_alloc( WithoutInitializing ) , N );
+    V vj( view_alloc( std::string("vj") , AllowPadding ) , N );
+    V vk( view_alloc( mem_space , std::string("vk") , AllowPadding ) , N );
+  }
+
+  {
+    typedef Kokkos::Experimental::ViewTraits<int***,Kokkos::LayoutStride,ExecSpace>  traits_t ;
+    typedef Kokkos::Experimental::Impl::ViewDimension<0,0,0>                         dims_t ;
+    typedef Kokkos::Experimental::Impl::ViewOffset< dims_t , Kokkos::LayoutStride >  offset_t ;
+
+    Kokkos::LayoutStride stride ;
+
+    stride.dimension[0] = 3 ;
+    stride.dimension[1] = 4 ;
+    stride.dimension[2] = 5 ;
+    stride.stride[0] = 4 ;
+    stride.stride[1] = 1 ;
+    stride.stride[2] = 12 ;
+
+    const offset_t offset( stride );
+
+    ASSERT_EQ( offset.dimension_0() , 3 );
+    ASSERT_EQ( offset.dimension_1() , 4 );
+    ASSERT_EQ( offset.dimension_2() , 5 );
+
+    ASSERT_EQ( offset.stride_0() , 4 );
+    ASSERT_EQ( offset.stride_1() , 1 );
+    ASSERT_EQ( offset.stride_2() , 12 );
+
+    ASSERT_EQ( offset.span() , 60 );
+    ASSERT_TRUE( offset.span_is_contiguous() );
+
+    Kokkos::Experimental::Impl::ViewMapping< traits_t , void >  v( (int*) 0 , std::false_type() , stride );
+  }
+
+  {
+    typedef Kokkos::Experimental::View<int**,ExecSpace>  V ;
+    typedef typename V::HostMirror  M ;
+
+    constexpr int N0 = 10 ;
+    constexpr int N1 = 11 ;
+
+    V a("a",N0,N1);
+    M b = Kokkos::Experimental::create_mirror(a);
+    M c = Kokkos::Experimental::create_mirror_view(a);
+
+    for ( int i0 = 0 ; i0 < N0 ; ++i0 )
+    for ( int i1 = 0 ; i1 < N1 ; ++i1 )
+      b(i0,i1) = 1 + i0 + i1 * N0 ;
+
+    Kokkos::Experimental::deep_copy( a , b );
+    Kokkos::Experimental::deep_copy( c , a );
+
+    for ( int i0 = 0 ; i0 < N0 ; ++i0 )
+    for ( int i1 = 0 ; i1 < N1 ; ++i1 )
+      ASSERT_EQ( b(i0,i1) , c(i0,i1) );
+
+    Kokkos::Experimental::resize( b , 5 , 6 );
+    Kokkos::Experimental::realloc( c , 5 , 6 );
+
+    ASSERT_EQ( b.dimension_0() , 5 );
+    ASSERT_EQ( b.dimension_1() , 6 );
+    ASSERT_EQ( c.dimension_0() , 5 );
+    ASSERT_EQ( c.dimension_1() , 6 );
+  }
+}
+
+template< class ExecSpace >
+struct TestViewMappingSubview {
+
+  constexpr static int AN = 10 ;
+  typedef Kokkos::Experimental::View<int*,ExecSpace>  AT ;
+  typedef Kokkos::Experimental::Subview< AT , true >  AS ;
+
+  constexpr static int BN0 = 10 , BN1 = 11 , BN2 = 12 ;
+  typedef Kokkos::Experimental::View<int***,ExecSpace>  BT ;
+  typedef Kokkos::Experimental::Subview< BT , true , true , true >  BS ;
+
+  constexpr static int CN0 = 10 , CN1 = 11 , CN2 = 12 ;
+  typedef Kokkos::Experimental::View<int***[13][14],ExecSpace>  CT ;
+  typedef Kokkos::Experimental::Subview< CT , true , true , true , false , false >  CS ;
+
+  constexpr static int DN0 = 10 , DN1 = 11 , DN2 = 12 ;
+  typedef Kokkos::Experimental::View<int***[13][14],ExecSpace>  DT ;
+  typedef Kokkos::Experimental::Subview< DT , false , true , true , true , false >  DS ;
+
+
+  typedef Kokkos::Experimental::View<int***[13][14],Kokkos::LayoutLeft,ExecSpace>  DLT ;
+  typedef Kokkos::Experimental::Subview< DLT , true , false , false , false , false >  DLS1 ;
+
+  static_assert( DLS1::rank == 1 && std::is_same< typename DLS1::array_layout , Kokkos::LayoutLeft >::value
+               , "Subview layout error for rank 1 subview of left-most range of LayoutLeft" );
+
+  typedef Kokkos::Experimental::View<int***[13][14],Kokkos::LayoutRight,ExecSpace>  DRT ;
+  typedef Kokkos::Experimental::Subview< DRT , false , false , false , false , true >  DRS1 ;
+
+  static_assert( DRS1::rank == 1 && std::is_same< typename DRS1::array_layout , Kokkos::LayoutRight >::value
+               , "Subview layout error for rank 1 subview of right-most range of LayoutRight" );
+
+  AT Aa ;
+  AS Ab ;
+  BT Ba ;
+  BS Bb ;
+  CT Ca ;
+  CS Cb ;
+  DT Da ;
+  DS Db ;
+
+  TestViewMappingSubview()
+    : Aa("Aa",AN)
+    , Ab( Kokkos::Experimental::subview( Aa , std::pair<int,int>(1,AN-1) ) )
+    , Ba("Ba",BN0,BN1,BN2)
+    , Bb( Kokkos::Experimental::subview( Ba
+                                        , std::pair<int,int>(1,BN0-1)
+                                        , std::pair<int,int>(1,BN1-1)
+                                        , std::pair<int,int>(1,BN2-1)
+                                        ) )
+    , Ca("Ca",CN0,CN1,CN2)
+    , Cb( Kokkos::Experimental::subview( Ca
+                                        , std::pair<int,int>(1,CN0-1)
+                                        , std::pair<int,int>(1,CN1-1)
+                                        , std::pair<int,int>(1,CN2-1)
+                                        , 1
+                                        , 2
+                                        ) )
+    , Da("Da",DN0,DN1,DN2)
+    , Db( Kokkos::Experimental::subview( Da
+                                        , 1
+                                        , std::pair<int,int>(1,DN0-1)
+                                        , std::pair<int,int>(1,DN1-1)
+                                        , std::pair<int,int>(1,DN2-1)
+                                        , 2
+                                        ) )
+    {
+    }
+
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const int , long & error_count ) const
+    {
+      for ( int i = 1 ; i < AN-1 ; ++i ) if( & Aa[i] != & Ab[i-1] ) ++error_count ;
+
+      for ( int i2 = 1 ; i2 < BN2-1 ; ++i2 ) {
+      for ( int i1 = 1 ; i1 < BN1-1 ; ++i1 ) {
+      for ( int i0 = 1 ; i0 < BN0-1 ; ++i0 ) {
+        if ( & Ba(i0,i1,i2) != & Bb(i0-1,i1-1,i2-1) ) ++error_count ;
+      }}}
+
+      for ( int i2 = 1 ; i2 < CN2-1 ; ++i2 ) {
+      for ( int i1 = 1 ; i1 < CN1-1 ; ++i1 ) {
+      for ( int i0 = 1 ; i0 < CN0-1 ; ++i0 ) {
+        if ( & Ca(i0,i1,i2,1,2) != & Cb(i0-1,i1-1,i2-1) ) ++error_count ;
+      }}}
+
+      for ( int i2 = 1 ; i2 < DN2-1 ; ++i2 ) {
+      for ( int i1 = 1 ; i1 < DN1-1 ; ++i1 ) {
+      for ( int i0 = 1 ; i0 < DN0-1 ; ++i0 ) {
+        if ( & Da(1,i0,i1,i2,2) != & Db(i0-1,i1-1,i2-1) ) ++error_count ;
+      }}}
+    }
+
+  static void run()
+  {
+    TestViewMappingSubview self ;
+
+    ASSERT_EQ( self.Da.stride_1() , self.Db.stride_0() );
+    ASSERT_EQ( self.Da.stride_2() , self.Db.stride_1() );
+    ASSERT_EQ( self.Da.stride_3() , self.Db.stride_2() );
+
+    long error_count = -1 ;
+    Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >(0,1) , self , error_count );
+    ASSERT_EQ( error_count , 0 );
+  }
+
+};
+
+template< class ExecSpace >
+void test_view_mapping_subview()
+{
+  TestViewMappingSubview< ExecSpace >::run();
+}
+
+/*--------------------------------------------------------------------------*/
+
+template< class ViewType >
+struct TestViewMapOperator {
+
+  static_assert( ViewType::reference_type_is_lvalue_reference
+               , "Test only valid for lvalue reference type" );
+
+  const ViewType v ;
+
+  KOKKOS_INLINE_FUNCTION
+  void test_left( size_t i0 , long & error_count ) const
+    {
+      typename ViewType::value_type * const base_ptr = & v(0,0,0,0,0,0,0,0);
+      const size_t n1 = v.dimension_1();
+      const size_t n2 = v.dimension_2();
+      const size_t n3 = v.dimension_3();
+      const size_t n4 = v.dimension_4();
+      const size_t n5 = v.dimension_5();
+      const size_t n6 = v.dimension_6();
+      const size_t n7 = v.dimension_7();
+
+      long offset = 0 ;
+
+      for ( size_t i7 = 0 ; i7 < n7 ; ++i7 )
+      for ( size_t i6 = 0 ; i6 < n6 ; ++i6 )
+      for ( size_t i5 = 0 ; i5 < n5 ; ++i5 )
+      for ( size_t i4 = 0 ; i4 < n4 ; ++i4 )
+      for ( size_t i3 = 0 ; i3 < n3 ; ++i3 )
+      for ( size_t i2 = 0 ; i2 < n2 ; ++i2 )
+      for ( size_t i1 = 0 ; i1 < n1 ; ++i1 )
+      {
+        const long d = & v(i0,i1,i2,i3,i4,i5,i6,i7) - base_ptr ;
+        if ( d < offset ) ++error_count ;
+        offset = d ;
+      }
+
+      if ( v.span() <= size_t(offset) ) ++error_count ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void test_right( size_t i0 , long & error_count ) const
+    {
+      typename ViewType::value_type * const base_ptr = & v(0,0,0,0,0,0,0,0);
+      const size_t n1 = v.dimension_1();
+      const size_t n2 = v.dimension_2();
+      const size_t n3 = v.dimension_3();
+      const size_t n4 = v.dimension_4();
+      const size_t n5 = v.dimension_5();
+      const size_t n6 = v.dimension_6();
+      const size_t n7 = v.dimension_7();
+
+      long offset = 0 ;
+
+      for ( size_t i1 = 0 ; i1 < n1 ; ++i1 )
+      for ( size_t i2 = 0 ; i2 < n2 ; ++i2 )
+      for ( size_t i3 = 0 ; i3 < n3 ; ++i3 )
+      for ( size_t i4 = 0 ; i4 < n4 ; ++i4 )
+      for ( size_t i5 = 0 ; i5 < n5 ; ++i5 )
+      for ( size_t i6 = 0 ; i6 < n6 ; ++i6 )
+      for ( size_t i7 = 0 ; i7 < n7 ; ++i7 )
+      {
+        const long d = & v(i0,i1,i2,i3,i4,i5,i6,i7) - base_ptr ;
+        if ( d < offset ) ++error_count ;
+        offset = d ;
+      }
+
+      if ( v.span() <= size_t(offset) ) ++error_count ;
+    }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( size_t i , long & error_count ) const
+    {
+      if ( std::is_same< typename ViewType::array_layout , Kokkos::LayoutLeft >::value )
+        test_left(i,error_count);
+      else if ( std::is_same< typename ViewType::array_layout , Kokkos::LayoutRight >::value )
+        test_right(i,error_count);
+    }
+
+  constexpr static size_t N0 = 10 ;
+  constexpr static size_t N1 =  9 ;
+  constexpr static size_t N2 =  8 ;
+  constexpr static size_t N3 =  7 ;
+  constexpr static size_t N4 =  6 ;
+  constexpr static size_t N5 =  5 ;
+  constexpr static size_t N6 =  4 ;
+  constexpr static size_t N7 =  3 ;
+
+  TestViewMapOperator() : v( "Test" , N0, N1, N2, N3, N4, N5, N6, N7 ) {}
+
+  static void run()
+    {
+      TestViewMapOperator self ;
+
+      ASSERT_EQ( self.v.dimension_0() , ( 0 < ViewType::rank ? N0 : 1 ) );
+      ASSERT_EQ( self.v.dimension_1() , ( 1 < ViewType::rank ? N1 : 1 ) );
+      ASSERT_EQ( self.v.dimension_2() , ( 2 < ViewType::rank ? N2 : 1 ) );
+      ASSERT_EQ( self.v.dimension_3() , ( 3 < ViewType::rank ? N3 : 1 ) );
+      ASSERT_EQ( self.v.dimension_4() , ( 4 < ViewType::rank ? N4 : 1 ) );
+      ASSERT_EQ( self.v.dimension_5() , ( 5 < ViewType::rank ? N5 : 1 ) );
+      ASSERT_EQ( self.v.dimension_6() , ( 6 < ViewType::rank ? N6 : 1 ) );
+      ASSERT_EQ( self.v.dimension_7() , ( 7 < ViewType::rank ? N7 : 1 ) );
+
+      ASSERT_LE( self.v.dimension_0()*
+                 self.v.dimension_1()*
+                 self.v.dimension_2()*
+                 self.v.dimension_3()*
+                 self.v.dimension_4()*
+                 self.v.dimension_5()*
+                 self.v.dimension_6()*
+                 self.v.dimension_7()
+               , self.v.span() );
+
+      long error_count ;
+      Kokkos::RangePolicy< typename ViewType::execution_space > range(0,self.v.dimension_0());
+      Kokkos::parallel_reduce( range , self , error_count );
+      ASSERT_EQ( 0 , error_count );
+    }
+};
+
+
+template< class ExecSpace >
+void test_view_mapping_operator()
+{
+  TestViewMapOperator< Kokkos::Experimental::View<int,Kokkos::LayoutLeft,ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int*,Kokkos::LayoutLeft,ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int**,Kokkos::LayoutLeft,ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int***,Kokkos::LayoutLeft,ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int****,Kokkos::LayoutLeft,ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int*****,Kokkos::LayoutLeft,ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int******,Kokkos::LayoutLeft,ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int*******,Kokkos::LayoutLeft,ExecSpace> >::run();
+
+  TestViewMapOperator< Kokkos::Experimental::View<int,Kokkos::LayoutRight,ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int*,Kokkos::LayoutRight,ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int**,Kokkos::LayoutRight,ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int***,Kokkos::LayoutRight,ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int****,Kokkos::LayoutRight,ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int*****,Kokkos::LayoutRight,ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int******,Kokkos::LayoutRight,ExecSpace> >::run();
+  TestViewMapOperator< Kokkos::Experimental::View<int*******,Kokkos::LayoutRight,ExecSpace> >::run();
+}
+
+/*--------------------------------------------------------------------------*/
+
+template< class ExecSpace >
+struct TestViewMappingAtomic {
+  typedef Kokkos::MemoryTraits< Kokkos::Atomic >  mem_trait ;
+
+  typedef Kokkos::Experimental::View< int * , ExecSpace > T ;
+  typedef Kokkos::Experimental::View< int * , ExecSpace , mem_trait >  T_atom ;
+
+  T      x ;
+  T_atom x_atom ;
+
+  constexpr static size_t N = 100000 ;
+
+  struct TagInit {};
+  struct TagUpdate {};
+  struct TagVerify {};
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagInit & , const int i ) const
+    { x(i) = i ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagUpdate & , const int i ) const
+    { x_atom(i%2) += 1 ; }
+
+  KOKKOS_INLINE_FUNCTION
+  void operator()( const TagVerify & , const int i , long & error_count ) const
+    {
+       if ( i < 2 ) { if ( x(i) != int(i + N / 2) ) ++error_count ; }
+       else         { if ( x(i) != int(i) ) ++error_count ; }
+    }
+
+  TestViewMappingAtomic()
+    : x("x",N)
+    , x_atom( x )
+    {}
+
+  static void run()
+    {
+      ASSERT_TRUE( T::reference_type_is_lvalue_reference );
+      ASSERT_FALSE( T_atom::reference_type_is_lvalue_reference );
+
+      TestViewMappingAtomic self ;
+      Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace , TagInit >(0,N) , self );
+      Kokkos::parallel_for( Kokkos::RangePolicy< ExecSpace , TagUpdate >(0,N) , self );
+      long error_count = -1 ;
+      Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , TagVerify >(0,N) , self , error_count );
+      ASSERT_EQ( 0 , error_count );
+    }
+};
+
+
+} /* namespace Test */
+
+/*--------------------------------------------------------------------------*/
+
diff --git a/lib/kokkos/core/unit_test/TestViewOfClass.hpp b/lib/kokkos/core/unit_test/TestViewOfClass.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..09abacd80de10950f94866a1b0ad368bc9527ce7
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestViewOfClass.hpp
@@ -0,0 +1,126 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+/*--------------------------------------------------------------------------*/
+
+namespace Test {
+
+namespace {
+volatile int nested_view_count ;
+}
+
+template< class Space >
+class NestedView {
+private:
+  Kokkos::View<int*,Space> member ;
+
+public:
+
+  KOKKOS_INLINE_FUNCTION
+  NestedView()
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    : member("member",2)
+    { Kokkos::atomic_increment( & nested_view_count ); }
+#else
+    : member(){}
+#endif
+
+  ~NestedView()
+#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
+    { Kokkos::atomic_decrement( & nested_view_count ); }
+#else
+    {}
+#endif
+
+};
+
+
+template< class Space >
+void view_nested_view()
+{
+  ASSERT_EQ( 0 , nested_view_count );
+  {
+    Kokkos::View< NestedView<Space> * , Space > a("a_nested_view",2);
+    ASSERT_EQ( 2 , nested_view_count );
+    Kokkos::View< NestedView<Space> * , Space > b("b_nested_view",2);
+    ASSERT_EQ( 4 , nested_view_count );
+  }
+  // ASSERT_EQ( 0 , nested_view_count );
+}
+
+}
+
+namespace Kokkos {
+namespace Impl {
+
+template< class ExecSpace , class S >
+struct ViewDefaultConstruct< ExecSpace , Test::NestedView<S> , true >
+{
+  typedef Test::NestedView<S> type ;
+  type * const m_ptr ;
+
+  KOKKOS_FORCEINLINE_FUNCTION
+  void operator()( const typename ExecSpace::size_type& i ) const
+    { new(m_ptr+i) type(); }
+
+  ViewDefaultConstruct( type * pointer , size_t capacity )
+    : m_ptr( pointer )
+    {
+      Kokkos::RangePolicy< ExecSpace > range( 0 , capacity );
+      parallel_for( range , *this );
+      ExecSpace::fence();
+    }
+};
+
+} // namespace Impl
+} // namespace Kokkos
+
+/*--------------------------------------------------------------------------*/
+
diff --git a/lib/kokkos/core/unit_test/TestViewSubview.hpp b/lib/kokkos/core/unit_test/TestViewSubview.hpp
new file mode 100755
index 0000000000000000000000000000000000000000..8bf201fb47c41f0d3d2da2007057c5ef2aa54f23
--- /dev/null
+++ b/lib/kokkos/core/unit_test/TestViewSubview.hpp
@@ -0,0 +1,632 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+#include <Kokkos_Core.hpp>
+#include <stdexcept>
+#include <sstream>
+#include <iostream>
+
+/*--------------------------------------------------------------------------*/
+
+namespace TestViewSubview {
+
+#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
+
+using Kokkos::Experimental::ALL ;
+
+#else
+
+namespace {
+
+const Kokkos::ALL ALL ;
+
+}
+
+#endif
+
+template<class Layout, class Space>
+struct getView {
+  static
+    Kokkos::View<double**,Layout,Space> get(int n, int m) {
+      return Kokkos::View<double**,Layout,Space>("G",n,m);
+  }
+};
+
+template<class Space>
+struct getView<Kokkos::LayoutStride,Space> {
+  static
+    Kokkos::View<double**,Kokkos::LayoutStride,Space> get(int n, int m) {
+      const int rank = 2 ;
+      const int order[] = { 0, 1 };
+      const unsigned dim[] = { unsigned(n), unsigned(m) };
+      Kokkos::LayoutStride stride = Kokkos::LayoutStride::order_dimensions( rank , order , dim );
+      return Kokkos::View<double**,Kokkos::LayoutStride,Space>("G",stride);
+  }
+};
+
+template<class ViewType, class Space>
+struct fill_1D {
+  typedef typename Space::execution_space execution_space;
+  typedef typename ViewType::size_type size_type;
+  ViewType a;
+  double val;
+  fill_1D(ViewType a_, double val_):a(a_),val(val_) {
+  }
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int i) const {
+    a(i) = val;
+  }
+};
+
+template<class ViewType, class Space>
+struct fill_2D {
+  typedef typename Space::execution_space execution_space;
+  typedef typename ViewType::size_type size_type;
+  ViewType a;
+  double val;
+  fill_2D(ViewType a_, double val_):a(a_),val(val_) {
+  }
+  KOKKOS_INLINE_FUNCTION
+  void operator() (const int i) const{
+    for(int j = 0; j < static_cast<int>(a.dimension_1()); j++)
+      a(i,j) = val;
+  }
+};
+
+template<class Layout, class Space>
+void test_auto_1d ()
+{
+  typedef Kokkos::View<double**, Layout, Space> mv_type;
+  typedef typename mv_type::size_type size_type;
+  const double ZERO = 0.0;
+  const double ONE = 1.0;
+  const double TWO = 2.0;
+
+  const size_type numRows = 10;
+  const size_type numCols = 3;
+
+  mv_type X = getView<Layout,Space>::get(numRows, numCols);
+  typename mv_type::HostMirror X_h = Kokkos::create_mirror_view (X);
+
+  fill_2D<mv_type,Space> f1(X, ONE);
+  Kokkos::parallel_for(X.dimension_0(),f1);
+  Kokkos::deep_copy (X_h, X);
+  for (size_type j = 0; j < numCols; ++j) {
+    for (size_type i = 0; i < numRows; ++i) {
+      ASSERT_TRUE(X_h(i,j) == ONE);
+    }
+  }
+
+  fill_2D<mv_type,Space> f2(X, 0.0);
+  Kokkos::parallel_for(X.dimension_0(),f2);
+  Kokkos::deep_copy (X_h, X);
+  for (size_type j = 0; j < numCols; ++j) {
+    for (size_type i = 0; i < numRows; ++i) {
+      ASSERT_TRUE(X_h(i,j) == ZERO);
+    }
+  }
+
+  fill_2D<mv_type,Space> f3(X, TWO);
+  Kokkos::parallel_for(X.dimension_0(),f3);
+  Kokkos::deep_copy (X_h, X);
+  for (size_type j = 0; j < numCols; ++j) {
+    for (size_type i = 0; i < numRows; ++i) {
+      ASSERT_TRUE(X_h(i,j) == TWO);
+    }
+  }
+
+  for (size_type j = 0; j < numCols; ++j) {
+    auto X_j = Kokkos::subview (X, TestViewSubview::ALL, j);
+
+    fill_1D<decltype(X_j),Space> f4(X_j, ZERO);
+    Kokkos::parallel_for(X_j.dimension_0(),f4);
+    Kokkos::deep_copy (X_h, X);
+    for (size_type i = 0; i < numRows; ++i) {
+      ASSERT_TRUE(X_h(i,j) == ZERO);
+    }
+
+    for (size_type jj = 0; jj < numCols; ++jj) {
+      auto X_jj = Kokkos::subview (X, TestViewSubview::ALL, jj);
+      fill_1D<decltype(X_jj),Space> f5(X_jj, ONE);
+      Kokkos::parallel_for(X_jj.dimension_0(),f5);
+      Kokkos::deep_copy (X_h, X);
+      for (size_type i = 0; i < numRows; ++i) {
+        ASSERT_TRUE(X_h(i,jj) == ONE);
+      }
+    }
+  }
+}
+
+template<class LD, class LS, class Space>
+void test_1d_strided_assignment_impl(bool a, bool b, bool c, bool d, int n, int m) {
+  Kokkos::View<double**,LS,Space> l2d("l2d",n,m);
+
+  int col = n>2?2:0;
+  int row = m>2?2:0;
+
+  if(Kokkos::Impl::VerifyExecutionCanAccessMemorySpace<Kokkos::HostSpace,Space>::value) {
+  if(a) {
+    Kokkos::View<double*,LD,Space> l1da = Kokkos::subview(l2d,TestViewSubview::ALL,row);
+    ASSERT_TRUE( & l1da(0) == & l2d(0,row) );
+    if(n>1)
+      ASSERT_TRUE( & l1da(1) == & l2d(1,row) );
+  }
+  if(b && n>13) {
+    Kokkos::View<double*,LD,Space> l1db = Kokkos::subview(l2d,std::pair<unsigned,unsigned>(2,13),row);
+    ASSERT_TRUE( & l1db(0) == & l2d(2,row) );
+    ASSERT_TRUE( & l1db(1) == & l2d(3,row) );
+  }
+  if(c) {
+    Kokkos::View<double*,LD,Space> l1dc = Kokkos::subview(l2d,col,TestViewSubview::ALL);
+    ASSERT_TRUE( & l1dc(0) == & l2d(col,0) );
+    if(m>1)
+      ASSERT_TRUE( & l1dc(1) == & l2d(col,1) );
+  }
+  if(d && m>13) {
+    Kokkos::View<double*,LD,Space> l1dd = Kokkos::subview(l2d,col,std::pair<unsigned,unsigned>(2,13));
+    ASSERT_TRUE( & l1dd(0) == & l2d(col,2) );
+    ASSERT_TRUE( & l1dd(1) == & l2d(col,3) );
+  }
+  }
+
+}
+
+template<class Space >
+void test_1d_strided_assignment() {
+  test_1d_strided_assignment_impl<Kokkos::LayoutStride,Kokkos::LayoutLeft,Space>(true,true,true,true,17,3);
+  test_1d_strided_assignment_impl<Kokkos::LayoutStride,Kokkos::LayoutRight,Space>(true,true,true,true,17,3);
+
+  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutLeft,Space>(true,true,false,false,17,3);
+  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutLeft,Space>(true,true,false,false,17,3);
+  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutRight,Space>(false,false,true,true,17,3);
+  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutRight,Space>(false,false,true,true,17,3);
+
+  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutLeft,Space>(true,true,false,false,17,1);
+  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutLeft,Space>(true,true,true,true,1,17);
+  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutLeft,Space>(true,true,true,true,1,17);
+  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutLeft,Space>(true,true,false,false,17,1);
+
+  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutRight,Space>(true,true,true,true,17,1);
+  test_1d_strided_assignment_impl<Kokkos::LayoutLeft,Kokkos::LayoutRight,Space>(false,false,true,true,1,17);
+  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutRight,Space>(false,false,true,true,1,17);
+  test_1d_strided_assignment_impl<Kokkos::LayoutRight,Kokkos::LayoutRight,Space>(true,true,true,true,17,1);
+}
+
+template< class Space >
+void test_left_0()
+{
+  typedef Kokkos::View< int [2][3][4][5][2][3][4][5] , Kokkos::LayoutLeft , Space >
+    view_static_8_type ;
+
+  view_static_8_type  x_static_8("x_static_left_8");
+
+  ASSERT_TRUE( x_static_8.is_contiguous() );
+
+  Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( x_static_8 , 0, 0, 0, 0, 0, 0, 0, 0 );
+
+  ASSERT_TRUE( x0.is_contiguous() );
+  ASSERT_TRUE( & x0() == & x_static_8(0,0,0,0,0,0,0,0) );
+
+  Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 =
+    Kokkos::subview( x_static_8, Kokkos::pair<int,int>(0,2), 1, 2, 3, 0, 1, 2, 3 );
+
+  ASSERT_TRUE( x1.is_contiguous() );
+  ASSERT_TRUE( & x1(0) == & x_static_8(0,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & x1(1) == & x_static_8(1,1,2,3,0,1,2,3) );
+
+  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 =
+    Kokkos::subview( x_static_8, Kokkos::pair<int,int>(0,2), 1, 2, 3
+                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+
+  ASSERT_TRUE( ! x2.is_contiguous() );
+  ASSERT_TRUE( & x2(0,0) == & x_static_8(0,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & x2(1,0) == & x_static_8(1,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & x2(0,1) == & x_static_8(0,1,2,3,1,1,2,3) );
+  ASSERT_TRUE( & x2(1,1) == & x_static_8(1,1,2,3,1,1,2,3) );
+
+  // Kokkos::View<int**,Kokkos::LayoutLeft,Space> error_2 =
+  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
+    Kokkos::subview( x_static_8, 1, Kokkos::pair<int,int>(0,2), 2, 3
+                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+
+  ASSERT_TRUE( ! sx2.is_contiguous() );
+  ASSERT_TRUE( & sx2(0,0) == & x_static_8(1,0,2,3,0,1,2,3) );
+  ASSERT_TRUE( & sx2(1,0) == & x_static_8(1,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & sx2(0,1) == & x_static_8(1,0,2,3,1,1,2,3) );
+  ASSERT_TRUE( & sx2(1,1) == & x_static_8(1,1,2,3,1,1,2,3) );
+
+  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
+    Kokkos::subview( x_static_8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */
+                               , 1, Kokkos::pair<int,int>(1,3) /* of [5] */
+                               , 1, Kokkos::pair<int,int>(0,2) /* of [3] */
+                               , 2, Kokkos::pair<int,int>(2,4) /* of [5] */
+                   );
+
+  ASSERT_TRUE( ! sx4.is_contiguous() );
+
+  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
+  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
+  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
+  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
+    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x_static_8(0,0+i0, 1,1+i1, 1,0+i2, 2,2+i3) );
+  }
+}
+
+template< class Space >
+void test_left_1()
+{
+  typedef Kokkos::View< int ****[2][3][4][5] , Kokkos::LayoutLeft , Space >
+    view_type ;
+
+  view_type  x8("x_left_8",2,3,4,5);
+
+  ASSERT_TRUE( x8.is_contiguous() );
+
+  Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( x8 , 0, 0, 0, 0, 0, 0, 0, 0 );
+
+  ASSERT_TRUE( x0.is_contiguous() );
+  ASSERT_TRUE( & x0() == & x8(0,0,0,0,0,0,0,0) );
+
+  Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 =
+    Kokkos::subview( x8, Kokkos::pair<int,int>(0,2), 1, 2, 3, 0, 1, 2, 3 );
+
+  ASSERT_TRUE( x1.is_contiguous() );
+  ASSERT_TRUE( & x1(0) == & x8(0,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & x1(1) == & x8(1,1,2,3,0,1,2,3) );
+
+  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 =
+    Kokkos::subview( x8, Kokkos::pair<int,int>(0,2), 1, 2, 3
+                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+
+  ASSERT_TRUE( ! x2.is_contiguous() );
+  ASSERT_TRUE( & x2(0,0) == & x8(0,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & x2(1,0) == & x8(1,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & x2(0,1) == & x8(0,1,2,3,1,1,2,3) );
+  ASSERT_TRUE( & x2(1,1) == & x8(1,1,2,3,1,1,2,3) );
+
+  // Kokkos::View<int**,Kokkos::LayoutLeft,Space> error_2 =
+  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
+    Kokkos::subview( x8, 1, Kokkos::pair<int,int>(0,2), 2, 3
+                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+
+  ASSERT_TRUE( ! sx2.is_contiguous() );
+  ASSERT_TRUE( & sx2(0,0) == & x8(1,0,2,3,0,1,2,3) );
+  ASSERT_TRUE( & sx2(1,0) == & x8(1,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & sx2(0,1) == & x8(1,0,2,3,1,1,2,3) );
+  ASSERT_TRUE( & sx2(1,1) == & x8(1,1,2,3,1,1,2,3) );
+
+  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
+    Kokkos::subview( x8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */
+                       , 1, Kokkos::pair<int,int>(1,3) /* of [5] */
+                       , 1, Kokkos::pair<int,int>(0,2) /* of [3] */
+                       , 2, Kokkos::pair<int,int>(2,4) /* of [5] */
+                   );
+
+  ASSERT_TRUE( ! sx4.is_contiguous() );
+
+  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
+  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
+  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
+  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
+    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x8(0,0+i0, 1,1+i1, 1,0+i2, 2,2+i3) );
+  }
+}
+
+template< class Space >
+void test_left_2()
+{
+  typedef Kokkos::View< int **** , Kokkos::LayoutLeft , Space > view_type ;
+
+  view_type  x4("x4",2,3,4,5);
+
+  ASSERT_TRUE( x4.is_contiguous() );
+
+  Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( x4 , 0, 0, 0, 0 );
+
+  ASSERT_TRUE( x0.is_contiguous() );
+  ASSERT_TRUE( & x0() == & x4(0,0,0,0) );
+
+  Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 =
+    Kokkos::subview( x4, Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+
+  ASSERT_TRUE( x1.is_contiguous() );
+  ASSERT_TRUE( & x1(0) == & x4(0,1,2,3) );
+  ASSERT_TRUE( & x1(1) == & x4(1,1,2,3) );
+
+  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 =
+    Kokkos::subview( x4, Kokkos::pair<int,int>(0,2), 1, Kokkos::pair<int,int>(1,3), 2 );
+
+  ASSERT_TRUE( ! x2.is_contiguous() );
+  ASSERT_TRUE( & x2(0,0) == & x4(0,1,1,2) );
+  ASSERT_TRUE( & x2(1,0) == & x4(1,1,1,2) );
+  ASSERT_TRUE( & x2(0,1) == & x4(0,1,2,2) );
+  ASSERT_TRUE( & x2(1,1) == & x4(1,1,2,2) );
+
+  // Kokkos::View<int**,Kokkos::LayoutLeft,Space> error_2 =
+  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
+    Kokkos::subview( x4, 1, Kokkos::pair<int,int>(0,2)
+                       , 2, Kokkos::pair<int,int>(1,4) );
+
+  ASSERT_TRUE( ! sx2.is_contiguous() );
+  ASSERT_TRUE( & sx2(0,0) == & x4(1,0,2,1) );
+  ASSERT_TRUE( & sx2(1,0) == & x4(1,1,2,1) );
+  ASSERT_TRUE( & sx2(0,1) == & x4(1,0,2,2) );
+  ASSERT_TRUE( & sx2(1,1) == & x4(1,1,2,2) );
+  ASSERT_TRUE( & sx2(0,2) == & x4(1,0,2,3) );
+  ASSERT_TRUE( & sx2(1,2) == & x4(1,1,2,3) );
+
+  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
+    Kokkos::subview( x4, Kokkos::pair<int,int>(1,2) /* of [2] */
+                       , Kokkos::pair<int,int>(1,3) /* of [3] */
+                       , Kokkos::pair<int,int>(0,4) /* of [4] */
+                       , Kokkos::pair<int,int>(2,4) /* of [5] */
+                   );
+
+  ASSERT_TRUE( ! sx4.is_contiguous() );
+
+  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
+  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
+  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
+  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
+    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x4( 1+i0, 1+i1, 0+i2, 2+i3 ) );
+  }
+}
+
+template< class Space >
+void test_left_3()
+{
+  typedef Kokkos::View< int ** , Kokkos::LayoutLeft , Space > view_type ;
+
+  view_type  xm("x4",10,5);
+
+  ASSERT_TRUE( xm.is_contiguous() );
+
+  Kokkos::View<int,Kokkos::LayoutLeft,Space> x0 = Kokkos::subview( xm , 5, 3 );
+
+  ASSERT_TRUE( x0.is_contiguous() );
+  ASSERT_TRUE( & x0() == & xm(5,3) );
+
+  Kokkos::View<int*,Kokkos::LayoutLeft,Space> x1 =
+    Kokkos::subview( xm, TestViewSubview::ALL, 3 );
+
+  ASSERT_TRUE( x1.is_contiguous() );
+  for ( int i = 0 ; i < int(xm.dimension_0()) ; ++i ) {
+    ASSERT_TRUE( & x1(i) == & xm(i,3) );
+  }
+
+  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2 =
+    Kokkos::subview( xm, Kokkos::pair<int,int>(1,9), TestViewSubview::ALL );
+
+  ASSERT_TRUE( ! x2.is_contiguous() );
+  for ( int j = 0 ; j < int(x2.dimension_1()) ; ++j )
+  for ( int i = 0 ; i < int(x2.dimension_0()) ; ++i ) {
+    ASSERT_TRUE( & x2(i,j) == & xm(1+i,j) );
+  }
+
+  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2c =
+    Kokkos::subview( xm, TestViewSubview::ALL, std::pair<int,int>(2,4) );
+
+  ASSERT_TRUE( x2c.is_contiguous() );
+  for ( int j = 0 ; j < int(x2c.dimension_1()) ; ++j )
+  for ( int i = 0 ; i < int(x2c.dimension_0()) ; ++i ) {
+    ASSERT_TRUE( & x2c(i,j) == & xm(i,2+j) );
+  }
+
+  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2_n1 =
+    Kokkos::subview( xm , std::pair<int,int>(1,1) , TestViewSubview::ALL );
+
+  ASSERT_TRUE( x2_n1.dimension_0() == 0 );
+  ASSERT_TRUE( x2_n1.dimension_1() == xm.dimension_1() );
+
+  Kokkos::View<int**,Kokkos::LayoutLeft,Space> x2_n2 =
+    Kokkos::subview( xm , TestViewSubview::ALL , std::pair<int,int>(1,1) );
+
+  ASSERT_TRUE( x2_n2.dimension_0() == xm.dimension_0() );
+  ASSERT_TRUE( x2_n2.dimension_1() == 0 );
+}
+
+//----------------------------------------------------------------------------
+
+template< class Space >
+void test_right_0()
+{
+  typedef Kokkos::View< int [2][3][4][5][2][3][4][5] , Kokkos::LayoutRight , Space >
+    view_static_8_type ;
+
+  view_static_8_type  x_static_8("x_static_right_8");
+
+  Kokkos::View<int,Kokkos::LayoutRight,Space> x0 = Kokkos::subview( x_static_8 , 0, 0, 0, 0, 0, 0, 0, 0 );
+
+  ASSERT_TRUE( & x0() == & x_static_8(0,0,0,0,0,0,0,0) );
+
+  Kokkos::View<int*,Kokkos::LayoutRight,Space> x1 =
+    Kokkos::subview( x_static_8, 0, 1, 2, 3, 0, 1, 2, Kokkos::pair<int,int>(1,3) );
+
+  ASSERT_TRUE( & x1(0) == & x_static_8(0,1,2,3,0,1,2,1) );
+  ASSERT_TRUE( & x1(1) == & x_static_8(0,1,2,3,0,1,2,2) );
+
+  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2 =
+    Kokkos::subview( x_static_8, 0, 1, 2, Kokkos::pair<int,int>(1,3)
+                               , 0, 1, 2, Kokkos::pair<int,int>(1,3) );
+
+  ASSERT_TRUE( & x2(0,0) == & x_static_8(0,1,2,1,0,1,2,1) );
+  ASSERT_TRUE( & x2(1,0) == & x_static_8(0,1,2,2,0,1,2,1) );
+  ASSERT_TRUE( & x2(0,1) == & x_static_8(0,1,2,1,0,1,2,2) );
+  ASSERT_TRUE( & x2(1,1) == & x_static_8(0,1,2,2,0,1,2,2) );
+
+  // Kokkos::View<int**,Kokkos::LayoutRight,Space> error_2 =
+  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
+    Kokkos::subview( x_static_8, 1, Kokkos::pair<int,int>(0,2), 2, 3
+                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+
+  ASSERT_TRUE( & sx2(0,0) == & x_static_8(1,0,2,3,0,1,2,3) );
+  ASSERT_TRUE( & sx2(1,0) == & x_static_8(1,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & sx2(0,1) == & x_static_8(1,0,2,3,1,1,2,3) );
+  ASSERT_TRUE( & sx2(1,1) == & x_static_8(1,1,2,3,1,1,2,3) );
+
+  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
+    Kokkos::subview( x_static_8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */
+                               , 1, Kokkos::pair<int,int>(1,3) /* of [5] */
+                               , 1, Kokkos::pair<int,int>(0,2) /* of [3] */
+                               , 2, Kokkos::pair<int,int>(2,4) /* of [5] */
+                   );
+
+  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
+  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
+  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
+  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
+    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x_static_8(0, 0+i0, 1, 1+i1, 1, 0+i2, 2, 2+i3) );
+  }
+}
+
+template< class Space >
+void test_right_1()
+{
+  typedef Kokkos::View< int ****[2][3][4][5] , Kokkos::LayoutRight , Space >
+    view_type ;
+
+  view_type  x8("x_right_8",2,3,4,5);
+
+  Kokkos::View<int,Kokkos::LayoutRight,Space> x0 = Kokkos::subview( x8 , 0, 0, 0, 0, 0, 0, 0, 0 );
+
+  ASSERT_TRUE( & x0() == & x8(0,0,0,0,0,0,0,0) );
+
+  Kokkos::View<int*,Kokkos::LayoutRight,Space> x1 =
+    Kokkos::subview( x8, 0, 1, 2, 3, 0, 1, 2, Kokkos::pair<int,int>(1,3) );
+
+  ASSERT_TRUE( & x1(0) == & x8(0,1,2,3,0,1,2,1) );
+  ASSERT_TRUE( & x1(1) == & x8(0,1,2,3,0,1,2,2) );
+
+  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2 =
+    Kokkos::subview( x8, 0, 1, 2, Kokkos::pair<int,int>(1,3)
+                               , 0, 1, 2, Kokkos::pair<int,int>(1,3) );
+
+  ASSERT_TRUE( & x2(0,0) == & x8(0,1,2,1,0,1,2,1) );
+  ASSERT_TRUE( & x2(1,0) == & x8(0,1,2,2,0,1,2,1) );
+  ASSERT_TRUE( & x2(0,1) == & x8(0,1,2,1,0,1,2,2) );
+  ASSERT_TRUE( & x2(1,1) == & x8(0,1,2,2,0,1,2,2) );
+
+  // Kokkos::View<int**,Kokkos::LayoutRight,Space> error_2 =
+  Kokkos::View<int**,Kokkos::LayoutStride,Space> sx2 =
+    Kokkos::subview( x8, 1, Kokkos::pair<int,int>(0,2), 2, 3
+                               , Kokkos::pair<int,int>(0,2), 1, 2, 3 );
+
+  ASSERT_TRUE( & sx2(0,0) == & x8(1,0,2,3,0,1,2,3) );
+  ASSERT_TRUE( & sx2(1,0) == & x8(1,1,2,3,0,1,2,3) );
+  ASSERT_TRUE( & sx2(0,1) == & x8(1,0,2,3,1,1,2,3) );
+  ASSERT_TRUE( & sx2(1,1) == & x8(1,1,2,3,1,1,2,3) );
+
+  Kokkos::View<int****,Kokkos::LayoutStride,Space> sx4 =
+    Kokkos::subview( x8, 0, Kokkos::pair<int,int>(0,2) /* of [3] */
+                       , 1, Kokkos::pair<int,int>(1,3) /* of [5] */
+                       , 1, Kokkos::pair<int,int>(0,2) /* of [3] */
+                       , 2, Kokkos::pair<int,int>(2,4) /* of [5] */
+                   );
+
+  for ( int i0 = 0 ; i0 < (int) sx4.dimension_0() ; ++i0 )
+  for ( int i1 = 0 ; i1 < (int) sx4.dimension_1() ; ++i1 )
+  for ( int i2 = 0 ; i2 < (int) sx4.dimension_2() ; ++i2 )
+  for ( int i3 = 0 ; i3 < (int) sx4.dimension_3() ; ++i3 ) {
+    ASSERT_TRUE( & sx4(i0,i1,i2,i3) == & x8(0,0+i0, 1,1+i1, 1,0+i2, 2,2+i3) );
+  }
+}
+
+template< class Space >
+void test_right_3()
+{
+  typedef Kokkos::View< int ** , Kokkos::LayoutRight , Space > view_type ;
+
+  view_type  xm("x4",10,5);
+
+  ASSERT_TRUE( xm.is_contiguous() );
+
+  Kokkos::View<int,Kokkos::LayoutRight,Space> x0 = Kokkos::subview( xm , 5, 3 );
+
+  ASSERT_TRUE( x0.is_contiguous() );
+  ASSERT_TRUE( & x0() == & xm(5,3) );
+
+  Kokkos::View<int*,Kokkos::LayoutRight,Space> x1 =
+    Kokkos::subview( xm, 3, TestViewSubview::ALL );
+
+  ASSERT_TRUE( x1.is_contiguous() );
+  for ( int i = 0 ; i < int(xm.dimension_1()) ; ++i ) {
+    ASSERT_TRUE( & x1(i) == & xm(3,i) );
+  }
+
+  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2c =
+    Kokkos::subview( xm, Kokkos::pair<int,int>(1,9), TestViewSubview::ALL );
+
+  ASSERT_TRUE( x2c.is_contiguous() );
+  for ( int j = 0 ; j < int(x2c.dimension_1()) ; ++j )
+  for ( int i = 0 ; i < int(x2c.dimension_0()) ; ++i ) {
+    ASSERT_TRUE( & x2c(i,j) == & xm(1+i,j) );
+  }
+
+  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2 =
+    Kokkos::subview( xm, TestViewSubview::ALL, std::pair<int,int>(2,4) );
+
+  ASSERT_TRUE( ! x2.is_contiguous() );
+  for ( int j = 0 ; j < int(x2.dimension_1()) ; ++j )
+  for ( int i = 0 ; i < int(x2.dimension_0()) ; ++i ) {
+    ASSERT_TRUE( & x2(i,j) == & xm(i,2+j) );
+  }
+
+  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2_n1 =
+    Kokkos::subview( xm , std::pair<int,int>(1,1) , TestViewSubview::ALL );
+
+  ASSERT_TRUE( x2_n1.dimension_0() == 0 );
+  ASSERT_TRUE( x2_n1.dimension_1() == xm.dimension_1() );
+
+  Kokkos::View<int**,Kokkos::LayoutRight,Space> x2_n2 =
+    Kokkos::subview( xm , TestViewSubview::ALL , std::pair<int,int>(1,1) );
+
+  ASSERT_TRUE( x2_n2.dimension_0() == xm.dimension_0() );
+  ASSERT_TRUE( x2_n2.dimension_1() == 0 );
+}
+
+//----------------------------------------------------------------------------
+
+}
+
diff --git a/lib/kokkos/core/unit_test/UnitTestMain.cpp b/lib/kokkos/core/unit_test/UnitTestMain.cpp
new file mode 100755
index 0000000000000000000000000000000000000000..f952ab3db51028aff0a0ebfe313b2639e353ab87
--- /dev/null
+++ b/lib/kokkos/core/unit_test/UnitTestMain.cpp
@@ -0,0 +1,50 @@
+/*
+//@HEADER
+// ************************************************************************
+// 
+//                        Kokkos v. 2.0
+//              Copyright (2014) Sandia Corporation
+// 
+// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
+// the U.S. Government retains certain rights in this software.
+// 
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+// 1. Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//
+// 2. Redistributions in binary form must reproduce the above copyright
+// notice, this list of conditions and the following disclaimer in the
+// documentation and/or other materials provided with the distribution.
+//
+// 3. Neither the name of the Corporation nor the names of the
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
+// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
+// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+//
+// Questions? Contact  H. Carter Edwards (hcedwar@sandia.gov)
+// 
+// ************************************************************************
+//@HEADER
+*/
+
+#include <gtest/gtest.h>
+
+int main(int argc, char *argv[]) {
+  ::testing::InitGoogleTest(&argc,argv);
+  return RUN_ALL_TESTS();
+}
+
diff --git a/lib/kokkos/doc/Doxyfile b/lib/kokkos/doc/Doxyfile
new file mode 100755
index 0000000000000000000000000000000000000000..bc5c7486b27fc55ede35359b969af0a8008f960b
--- /dev/null
+++ b/lib/kokkos/doc/Doxyfile
@@ -0,0 +1,127 @@
+#
+# Include the global look and feel options
+#
+@INCLUDE               = ../../common/Doxyfile
+#
+# Package options
+#
+PROJECT_NAME           = "Kokkos Core Kernels Package"
+PROJECT_NUMBER         = "Version of the Day"
+OUTPUT_DIRECTORY       = .
+OUTPUT_LANGUAGE        = English
+
+EXTRACT_ALL            = NO
+EXTRACT_PRIVATE        = NO
+EXTRACT_STATIC         = YES
+HIDE_UNDOC_MEMBERS     = YES
+HIDE_UNDOC_CLASSES     = YES
+BRIEF_MEMBER_DESC      = YES
+REPEAT_BRIEF           = YES
+ALWAYS_DETAILED_SEC    = YES
+FULL_PATH_NAMES        = NO
+STRIP_FROM_PATH        = 
+INTERNAL_DOCS          = NO
+CLASS_DIAGRAMS         = YES
+SOURCE_BROWSER         = YES
+INLINE_SOURCES         = NO
+STRIP_CODE_COMMENTS    = YES
+REFERENCED_BY_RELATION = NO
+REFERENCES_RELATION    = NO
+CASE_SENSE_NAMES       = YES
+HIDE_SCOPE_NAMES       = NO
+VERBATIM_HEADERS       = YES
+SHOW_INCLUDE_FILES     = YES
+#JAVADOC_AUTOBRIEF      = YES
+INHERIT_DOCS           = YES
+INLINE_INHERITED_MEMB  = YES
+INLINE_INFO            = YES
+SORT_MEMBER_DOCS       = NO
+TAB_SIZE               = 2
+ENABLED_SECTIONS       = 
+SORT_BRIEF_DOCS        = NO
+GENERATE_TODOLIST      = YES
+GENERATE_TESTLIST      = YES
+QUIET                  = NO
+WARNINGS               = YES
+WARN_IF_UNDOCUMENTED   = YES
+WARN_FORMAT            = "$file:$line: $text"
+
+#
+# INPUT: Where to find files that Doxygen should process.  ../classic
+# has a doc/ subdirectory with its own Doxyfile that points to its own
+# files.  The other Kokkos subpackages don't currently have their own
+# Doxyfile files, so we have to do it manually here.
+#
+# mfh 26 Sep 2013: I've only added those directories in the Core
+# subpackage that constitute the "public interface" of that
+# subpackage.  Please feel free to include additional subdirectories
+# of ../core if you want to generate their documentation as well.
+#
+# mfh 26 Sep 2013: I've only added the Kokkos subpackages here that I
+# think are ready for Doxygen documentation generation.  Please feel
+# free to amend this list as you see fit.
+#
+
+INPUT                  = index.doc ../classic ../core/src ../containers/src ../linalg/src
+FILE_PATTERNS          = *.hpp *.cpp *.cuh *.cu
+RECURSIVE              = NO
+EXCLUDE_PATTERNS       = *.x *.o *.out
+EXAMPLE_PATH           = 
+EXAMPLE_RECURSIVE       = YES
+EXAMPLE_PATTERNS       = *.cpp *.hpp
+IMAGE_PATH             = 
+INPUT_FILTER           = 
+ALPHABETICAL_INDEX     = YES
+COLS_IN_ALPHA_INDEX    = 4
+IGNORE_PREFIX          = 
+#
+# What diagrams are created
+#
+CLASS_GRAPH            = YES
+COLLABORATION_GRAPH    = NO
+INCLUDE_GRAPH          = NO
+INCLUDED_BY_GRAPH      = NO
+GRAPHICAL_HIERARCHY    = YES
+#
+# Preprocessing
+#
+ENABLE_PREPROCESSING   = YES
+MACRO_EXPANSION        = YES
+EXPAND_ONLY_PREDEF     = YES
+SEARCH_INCLUDES        = YES
+INCLUDE_FILE_PATTERNS  = 
+PREDEFINED             = DOXYGEN_SHOULD_SKIP_THIS DOXYGEN_USE_ONLY
+INCLUDE_PATH           = ../src
+EXPAND_AS_DEFINED      = 
+#
+# Links to other packages
+#
+TAGFILES               = ../../common/tag_files/teuchos.tag=../../../teuchos/doc/html ../../common/tag_files/epetra.tag=../../../epetra/doc/html \
+                         ../../common/tag_files/belos.tag=../../../belos/doc/html ../../common/tag_files/anasazi.tag=../../../anasazi/doc/html \
+                         ../../common/tag_files/kokkos.tag=../../../kokkos/doc/html 
+GENERATE_TAGFILE       = ../../common/tag_files/tpetra.tag
+ALLEXTERNALS           = NO
+EXTERNAL_GROUPS        = NO
+#
+# Environment
+#
+PERL_PATH              = /usr/bin/perl
+HAVE_DOT               = YES
+DOT_PATH               = 
+MAX_DOT_GRAPH_WIDTH    = 1024
+MAX_DOT_GRAPH_HEIGHT   = 1024
+#
+# What kind of documentation is generated
+#
+#GENERATE_HTML          = YES
+#HTML_OUTPUT            = html
+#HTML_HEADER            = includes/header.html
+#HTML_FOOTER            = includes/footer.html
+#HTML_STYLESHEET        = includes/stylesheet.css
+#HTML_ALIGN_MEMBERS     = YES
+GENERATE_HTMLHELP      = NO
+DISABLE_INDEX          = NO
+GENERATE_LATEX         = NO
+GENERATE_RTF           = NO
+GENERATE_MAN           = NO
+GENERATE_XML           = NO
diff --git a/lib/kokkos/doc/Kokkos_PG.pdf b/lib/kokkos/doc/Kokkos_PG.pdf
new file mode 100755
index 0000000000000000000000000000000000000000..3c415698c0d9fec315f317b71db19f2a019b6f6e
Binary files /dev/null and b/lib/kokkos/doc/Kokkos_PG.pdf differ
diff --git a/lib/kokkos/doc/README b/lib/kokkos/doc/README
new file mode 100755
index 0000000000000000000000000000000000000000..31e75f365c21a116a1fb736097f4f524e8d1e021
--- /dev/null
+++ b/lib/kokkos/doc/README
@@ -0,0 +1,32 @@
+Kokkos uses the Doxygen tool for providing three documentation
+sources:
+- man pages
+- Latex User Guide
+- HTML Online User Guide.
+
+Man Pages
+
+Man pages are available for all files and functions in the directory
+TRILINOS_HOME/doc/kokkos/man, where TRILINOS_HOME is the location of your
+copy of Trilinos.  To use these pages with the Unix man utility, add
+the directory to your man path as follows:
+
+setenv MANPATH `echo $MANPATH`:TRILINOS_HOME/doc/kokkos/man
+
+
+LaTeX User Guide
+
+A postscript version of this guide is in
+TRILINOS_HOME/doc/kokkos/latex/user_guide.ps.  The LaTeX source is in the
+directory TRILINOS_HOME/doc/kokkos/latex.
+
+HTML Online User Guide
+
+The online guide is initiated by pointing your browser to
+TRILINOS_HOME/doc/kokkos/html/index.html
+
+Any question, comments or suggestions are welcome.  Please send to
+Mike Heroux at 
+
+320-845-7695
+maherou@sandia.gov
diff --git a/lib/kokkos/doc/build_docs b/lib/kokkos/doc/build_docs
new file mode 100755
index 0000000000000000000000000000000000000000..da1d3e4f6e061804b1fb2fe21b356b691494df5d
--- /dev/null
+++ b/lib/kokkos/doc/build_docs
@@ -0,0 +1,15 @@
+#!/bin/sh
+
+if [ $TRILINOS_HOME ]; then
+  echo "TRILINOS_HOME has already been set!"
+else
+  echo "TRILINOS_HOME has not been set.  Setting it!"
+  export TRILINOS_HOME=`pwd`/../../..
+fi
+
+echo
+echo "Generating main Kokkos doxygen documentation ..."
+echo
+
+doxygen Doxyfile
+
diff --git a/lib/kokkos/doc/index.doc b/lib/kokkos/doc/index.doc
new file mode 100755
index 0000000000000000000000000000000000000000..27a9e4f2e7b90e11bbcde7309e9bf1544e3b386f
--- /dev/null
+++ b/lib/kokkos/doc/index.doc
@@ -0,0 +1,72 @@
+/*! 
+\mainpage Trilinos/Kokkos: Shared-memory programming interface and computational kernels
+
+\section Kokkos_Intro Introduction
+
+The %Kokkos package has two main components.  The first, sometimes
+called "%Kokkos Array" or just "%Kokkos," implements a
+performance-portable shared-memory parallel programming model and data
+containers.  The second, called "%Kokkos Classic," consists of
+computational kernels that support the %Tpetra package.
+
+\section Kokkos_Kokkos The %Kokkos programming model
+
+%Kokkos implements a performance-portable shared-memory parallel
+programming model and data containers.  It lets you write an algorithm
+once, and just change a template parameter to get the optimal data
+layout for your hardware.  %Kokkos has back-ends for the following
+parallel programming models:
+
+- Kokkos::Threads: POSIX Threads (Pthreads)
+- Kokkos::OpenMP: OpenMP
+- Kokkos::Cuda: NVIDIA's CUDA programming model for graphics
+  processing units (GPUs)
+- Kokkos::Serial: No thread parallelism
+
+%Kokkos also has optimizations for shared-memory parallel systems with
+nonuniform memory access (NUMA).  Its containers can hold data of any
+primitive ("plain old") data type (and some aggregate types).  %Kokkos
+Array may be used as a stand-alone programming model.
+
+%Kokkos' parallel operations include the following:
+
+- parallel_for: a thread-parallel "for loop"
+- parallel_reduce: a thread-parallel reduction
+- parallel_scan: a thread-parallel prefix scan operation
+
+as well as expert-level platform-independent interfaces to thread
+"teams," per-team "shared memory," synchronization, and atomic update
+operations.
+
+%Kokkos' data containers include the following:
+
+- Kokkos::View: A multidimensional array suitable for thread-parallel
+  operations.  Its layout (e.g., row-major or column-major) is
+  optimized by default for the particular thread-parallel device.
+- Kokkos::Vector: A drop-in replacement for std::vector that eases
+  porting from standard sequential C++ data structures to %Kokkos'
+  parallel data structures.
+- Kokkos::UnorderedMap: A parallel lookup table comparable in
+  functionality to std::unordered_map.
+
+%Kokkos also uses the above basic containers to implement higher-level
+data structures, like sparse graphs and matrices.
+
+A good place to start learning about %Kokkos would be <a href="http://trilinos.sandia.gov/events/trilinos_user_group_2013/presentations/2013-11-TUG-Kokkos-Tutorial.pdf">these tutorial slides</a> from the 2013 Trilinos Users' Group meeting.
+
+\section Kokkos_Classic %Kokkos Classic
+
+"%Kokkos Classic" consists of computational kernels that support the
+%Tpetra package.  These kernels include sparse matrix-vector multiply,
+sparse triangular solve, Gauss-Seidel, and dense vector operations.
+They are templated on the type of objects (\c Scalar) on which they
+operate.  This component was not meant to be visible to users; it is
+an implementation detail of the %Tpetra distributed linear algebra
+package.  
+
+%Kokkos Classic also implements a shared-memory parallel programming
+model.  This inspired and preceded the %Kokkos programming model
+described in the previous section.  Users should consider the %Kokkos
+Classic programming model deprecated, and prefer the new %Kokkos
+programming model.
+*/
diff --git a/lib/kokkos/generate_makefile.bash b/lib/kokkos/generate_makefile.bash
new file mode 100755
index 0000000000000000000000000000000000000000..2e595dcc1c9f333ba84e38442a90c120625c949c
--- /dev/null
+++ b/lib/kokkos/generate_makefile.bash
@@ -0,0 +1,204 @@
+#!/bin/bash
+
+KOKKOS_DEVICES=""
+
+while [[ $# > 0 ]]
+do
+key="$1"
+
+case $key in
+    --kokkos-path*)
+    KOKKOS_PATH="${key#*=}"
+    ;;
+    --prefix*)
+    PREFIX="${key#*=}"
+    ;;
+    --with-cuda)
+    KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda"
+    CUDA_PATH_NVCC=`which nvcc`
+    CUDA_PATH=${CUDA_PATH_NVCC%/bin/nvcc}
+    ;;
+    --with-cuda*)
+    KOKKOS_DEVICES="${KOKKOS_DEVICES},Cuda"
+    CUDA_PATH="${key#*=}"
+    ;;
+    --with-openmp)
+    KOKKOS_DEVICES="${KOKKOS_DEVICES},OpenMP"
+    ;;
+    --with-pthread)
+    KOKKOS_DEVICES="${KOKKOS_DEVICES},Pthread"
+    ;;
+    --with-serial)
+    KOKKOS_DEVICES="${KOKKOS_DEVICES},Serial"
+    ;;
+    --with-devices*)
+    DEVICES="${key#*=}"
+    KOKKOS_DEVICES="${KOKKOS_DEVICES},${DEVICES}"
+    ;;
+    --with-gtest*)
+    GTEST_PATH="${key#*=}"
+    ;;
+    --with-hwloc*)
+    HWLOC_PATH="${key#*=}"
+    ;;
+    --arch*)
+    KOKKOS_ARCH="${key#*=}"
+    ;;
+    --cxxflags*)
+    CXXFLAGS="${key#*=}"
+    ;;
+    --ldflags*)
+    LDFLAGS="${key#*=}"
+    ;;
+    --debug|-dbg)
+    KOKKOS_DEBUG=yes
+    ;;
+    --compiler*)
+    COMPILER="${key#*=}"
+    ;;
+    --help)
+    echo "Kokkos configure options:"
+    echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
+    echo ""
+    echo "--with-cuda[=/Path/To/Cuda]: enable Cuda and set path to Cuda Toolkit"
+    echo "--with-openmp:               enable OpenMP backend"
+    echo "--with-pthread:              enable Pthreads backend"
+    echo "--with-serial:               enable Serial backend"
+    echo "--with-devices:              explicitly add a set of backends"
+    echo ""
+    echo "--arch=[OPTIONS]:            set target architectures. Options are:"
+    echo "                               SNB = Intel Sandy/Ivy Bridge CPUs"
+    echo "                               HSW = Intel Haswell CPUs"
+    echo "                               KNC = Intel Knights Corner Xeon Phi"
+    echo "                               Kepler30  = NVIDIA Kepler generation CC 3.0"
+    echo "                               Kepler35  = NVIDIA Kepler generation CC 3.5"
+    echo "                               Kepler37  = NVIDIA Kepler generation CC 3.7"
+    echo "                               Maxwell50 = NVIDIA Maxwell generation CC 5.0"
+    echo "                               Power8 = IBM Power 8 CPUs"
+    echo ""
+    echo "--compiler=/Path/To/Compiler set the compiler"
+    echo "--debug,-dbg:                enable Debugging"
+    echo "--cxxflags=[FLAGS]           overwrite CXXFLAGS for library build and test build"
+    echo "                               This will still set certain required flags via"
+    echo "                               KOKKOS_CXXFLAGS (such as -fopenmp, --std=c++11, etc.)"
+    echo "--ldflags=[FLAGS]            overwrite LDFLAGS for library build and test build"
+    echo "                               This will still set certain required flags via"
+    echo "                               KOKKOS_LDFLAGS (such as -fopenmp, -lpthread, etc.)"
+    echo "--with-gtest=/Path/To/Gtest: set path to gtest (used in unit and performance tests"
+    echo "--with-hwloc=/Path/To/Hwloc: set path to hwloc"
+    exit 0
+    ;;
+    *)
+            # unknown option
+    ;;
+esac
+shift
+done
+
+# If KOKKOS_PATH undefined, assume parent dir of this
+# script is the KOKKOS_PATH
+if [ -z "$KOKKOS_PATH" ]; then
+    KOKKOS_PATH=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+else
+    # Ensure KOKKOS_PATH is abs path
+    KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd )
+fi
+
+KOKKOS_OPTIONS="KOKKOS_PATH=${KOKKOS_PATH}"
+
+if [ ${#COMPILER} -gt 0 ]; then
+KOKKOS_OPTIONS="${KOKKOS_OPTIONS} CXX=${COMPILER}"
+fi
+if [ ${#PREFIX} -gt 0 ]; then
+KOKKOS_OPTIONS="${KOKKOS_OPTIONS} PREFIX=${PREFIX}"
+fi
+if [ ${#KOKKOS_DEVICES} -gt 0 ]; then
+KOKKOS_OPTIONS="${KOKKOS_OPTIONS} KOKKOS_DEVICES=${KOKKOS_DEVICES}"
+fi
+if [ ${#KOKKOS_ARCH} -gt 0 ]; then
+KOKKOS_OPTIONS="${KOKKOS_OPTIONS} KOKKOS_ARCH=${KOKKOS_ARCH}"
+fi
+if [ ${#KOKKOS_DEBUG} -gt 0 ]; then
+KOKKOS_OPTIONS="${KOKKOS_OPTIONS} KOKKOS_DEBUG=${KOKKOS_DEBUG}"
+fi
+if [ ${#CUDA_PATH} -gt 0 ]; then
+KOKKOS_OPTIONS="${KOKKOS_OPTIONS} CUDA_PATH=${CUDA_PATH}"
+fi
+if [ ${#CXXFLAGS} -gt 0 ]; then
+KOKKOS_OPTIONS="${KOKKOS_OPTIONS} CXXFLAGS=\"${CXXFLAGS}\""
+fi
+if [ ${#LDFLAGS} -gt 0 ]; then
+KOKKOS_OPTIONS="${KOKKOS_OPTIONS} LDFLAGS=\"${LDFLAGS}\""
+fi
+if [ ${#GTEST_PATH} -gt 0 ]; then
+KOKKOS_OPTIONS="${KOKKOS_OPTIONS} GTEST_PATH=${GTEST_PATH}"
+else
+GTEST_PATH=${KOKKOS_PATH}/tpls/gtest
+KOKKOS_OPTIONS="${KOKKOS_OPTIONS} GTEST_PATH=${GTEST_PATH}"
+fi
+if [ ${#HWLOC_PATH} -gt 0 ]; then
+KOKKOS_OPTIONS="${KOKKOS_OPTIONS} HWLOC_PATH=${HWLOC_PATH} KOKKOS_USE_TPLS=hwloc"
+fi
+mkdir core
+mkdir core/unit_test
+mkdir core/perf_test
+mkdir containers
+mkdir containers/unit_tests
+mkdir containers/performance_tests
+mkdir algorithms
+mkdir algorithms/unit_tests
+mkdir algorithms/performance_tests
+mkdir example
+mkdir example/fixture
+mkdir example/feint
+mkdir example/fenl
+
+
+echo "Generating Makefile with options " ${KOKKOS_OPTIONS}
+echo "KOKKOS_OPTIONS=${KOKKOS_OPTIONS}" > Makefile
+echo "" >> Makefile
+echo "lib:" >> Makefile
+echo -e "\tcd core; \\" >> Makefile
+echo -e "\tmake -j -j -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_OPTIONS}" >> Makefile
+echo "" >> Makefile
+echo "install: lib" >> Makefile
+echo -e "\tcd core; \\" >> Makefile
+echo -e "\tmake -j -f ${KOKKOS_PATH}/core/src/Makefile ${KOKKOS_OPTIONS} install" >> Makefile
+echo "" >> Makefile
+echo "build-test:" >> Makefile
+echo -e "\tcd core/unit_test; \\" >> Makefile
+echo -e "\tmake -j -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_OPTIONS}" >> Makefile
+echo -e "\tcd core/perf_test; \\" >> Makefile
+echo -e "\tmake -j -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_OPTIONS}" >> Makefile
+echo -e "\tcd containers/unit_tests; \\" >> Makefile
+echo -e "\tmake -j -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_OPTIONS}" >> Makefile
+echo -e "\tcd containers/performance_tests; \\" >> Makefile
+echo -e "\tmake -j -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_OPTIONS}" >> Makefile
+echo -e "\tcd algorithms/unit_tests; \\" >> Makefile
+echo -e "\tmake -j -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_OPTIONS}" >> Makefile
+echo -e "\tcd example/fixture; \\" >> Makefile
+echo -e "\tmake -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_OPTIONS}" >> Makefile
+echo -e "\tcd example/feint; \\" >> Makefile
+echo -e "\tmake -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_OPTIONS}" >> Makefile
+echo -e "\tcd example/fenl; \\" >> Makefile
+echo -e "\tmake -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_OPTIONS}" >> Makefile
+echo "" >> Makefile
+echo "test: build-test" >> Makefile
+echo -e "\tcd core/unit_test; \\" >> Makefile
+echo -e "\tmake -f ${KOKKOS_PATH}/core/unit_test/Makefile ${KOKKOS_OPTIONS} test" >> Makefile
+echo -e "\tcd core/perf_test; \\" >> Makefile
+echo -e "\tmake -f ${KOKKOS_PATH}/core/perf_test/Makefile ${KOKKOS_OPTIONS} test" >> Makefile
+echo -e "\tcd containers/unit_tests; \\" >> Makefile
+echo -e "\tmake -f ${KOKKOS_PATH}/containers/unit_tests/Makefile ${KOKKOS_OPTIONS} test" >> Makefile
+echo -e "\tcd containers/performance_tests; \\" >> Makefile
+echo -e "\tmake -f ${KOKKOS_PATH}/containers/performance_tests/Makefile ${KOKKOS_OPTIONS} test" >> Makefile
+echo -e "\tcd algorithms/unit_tests; \\" >> Makefile
+echo -e "\tmake -f ${KOKKOS_PATH}/algorithms/unit_tests/Makefile ${KOKKOS_OPTIONS} test" >> Makefile
+echo -e "\tcd example/fixture; \\" >> Makefile
+echo -e "\tmake -f ${KOKKOS_PATH}/example/fixture/Makefile ${KOKKOS_OPTIONS} test" >> Makefile
+echo -e "\tcd example/feint; \\" >> Makefile
+echo -e "\tmake -f ${KOKKOS_PATH}/example/feint/Makefile ${KOKKOS_OPTIONS} test" >> Makefile
+echo -e "\tcd example/fenl; \\" >> Makefile
+echo -e "\tmake -f ${KOKKOS_PATH}/example/fenl/Makefile ${KOKKOS_OPTIONS} test" >> Makefile
+
+